GitOrigin-RevId: 4c43a149f8
tags/v1.11.0
@@ -0,0 +1,29 @@
+#include "src/arm_common/adaptive_pooling/opr_impl.h"
+
+#include "src/common/opr_delegate.h"
+#include "src/naive/handle.h"
+
+namespace megdnn {
+namespace arm_common {
+
+void AdaptivePoolingImpl::exec(
+        _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
+    auto adapt_fwd = [=]() {
+        auto opr = inplace_cpu_handle()->create_operator<PoolingForward>();
+        opr->param() = deduce_pooling_param(src.layout, dst.layout);
+        opr->exec(src, dst, workspace);
+    };
+    MEGDNN_DISPATCH_CPU_KERN_OPR(adapt_fwd());
+    return;
+}
+
+size_t AdaptivePoolingImpl::get_workspace_in_bytes(
+        const TensorLayout& src, const TensorLayout& dst) {
+    auto opr = inplace_cpu_handle()->create_operator<PoolingForward>();
+    opr->param() = deduce_pooling_param(src, dst);
+    auto need_size = opr->get_workspace_in_bytes(src, dst);
+    return need_size;
+}
+
+}  // namespace arm_common
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "megdnn/oprs.h"
+
+namespace megdnn {
+namespace arm_common {
+
+class AdaptivePoolingImpl final : public AdaptivePoolingForward {
+public:
+    using AdaptivePoolingForward::AdaptivePoolingForward;
+    void exec(
+            _megdnn_tensor_in src, _megdnn_tensor_out dst,
+            _megdnn_workspace workspace) override;
+    size_t get_workspace_in_bytes(
+            const TensorLayout& src, const TensorLayout& dst) override;
+};
+
+}  // namespace arm_common
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
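Together, these two hunks implement ARM adaptive pooling by delegating to an ordinary PoolingForward on an in-place CPU handle. A usage sketch (illustrative only; make_workspace is a hypothetical helper, not part of this patch):

// How a caller would presumably reach this impl once the handle
// specialization below is registered:
//   auto opr = handle->create_operator<megdnn::AdaptivePoolingForward>();
//   opr->param().mode = param::AdaptivePooling::Mode::AVERAGE;
//   size_t ws = opr->get_workspace_in_bytes(src.layout, dst.layout);
//   opr->exec(src, dst, make_workspace(ws));  // make_workspace: hypothetical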
@@ -2,6 +2,7 @@
 #include "src/arm_common/handle.h"
 
+#include "src/arm_common/adaptive_pooling/opr_impl.h"
 #include "src/arm_common/conv_bias/opr_impl.h"
 #include "src/arm_common/convolution/opr_impl.h"
 #include "src/arm_common/cvt_color/opr_impl.h"
@@ -45,6 +46,7 @@ MEGDNN_SPECIALIZE_CREATE_OPERATOR(ConvolutionBackwardData)
 MEGDNN_SPECIALIZE_CREATE_OPERATOR(RNNCell)
 MEGDNN_SPECIALIZE_CREATE_OPERATOR(LSTMCell)
 MEGDNN_SPECIALIZE_CREATE_OPERATOR(LSTM)
+MEGDNN_SPECIALIZE_CREATE_OPERATOR(AdaptivePooling)
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wpragmas"
@@ -2,6 +2,7 @@
 #include "src/arm_common/pooling/algo.h"
 #include "src/common/algo_chooser.h"
 #include "src/common/metahelper.h"
+#include "src/common/opr_delegate.h"
 
 using namespace megdnn;
 using namespace arm_common;
@@ -48,10 +49,72 @@ public:
 };
 PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack;
 
+namespace {
+TensorLayout merge_hw_layout(TensorLayout src) {
+    src.ndim -= 1;
+    src.shape[2] = src.shape[2] * src.shape[3];
+    src.stride[2] = src.stride[3];
+    for (size_t i = 3; i < src.ndim; ++i) {
+        src.shape[i] = src.shape[i + 1];
+        src.stride[i] = src.stride[i + 1];
+    }
+    return src;
+}
+
+std::pair<TensorND, TensorND> get_global_pooling_reduce_tensor(
+        const TensorND& src, const TensorND& dst) {
+    auto reduce_src_layout = merge_hw_layout(src.layout);
+    auto reduce_dst_layout = merge_hw_layout(dst.layout);
+    return std::make_pair<TensorND, TensorND>(
+            {src.raw_ptr(), reduce_src_layout}, {dst.raw_ptr(), reduce_dst_layout});
+}
+
+std::unique_ptr<Reduce> get_global_pooling_reduce_opr(
+        Handle* handle, const PoolingImpl::PoolingKernSizeParam& param) {
+    std::unique_ptr<Reduce> opr;
+    if (handle) {
+        opr = handle->create_operator<Reduce>();
+    } else {
+        opr = inplace_cpu_handle()->create_operator<Reduce>();
+    }
+    param::Reduce reduce_param;
+    reduce_param.axis = 2;
+    if (param.mode == PoolingImpl::Param::Mode::MAX) {
+        reduce_param.mode = param::Reduce::Mode::MAX;
+    } else {
+        megdnn_assert(
+                param.mode == PoolingImpl::Param::Mode::AVERAGE ||
+                param.mode == PoolingImpl::Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING);
+        reduce_param.mode = param::Reduce::Mode::MEAN;
+    }
+    opr->param() = reduce_param;
+    return opr;
+}
+
+bool is_global_pooling_reduce(PoolingImpl::PoolingKernSizeParam& param) {
+    bool fmt_ok = param.format == PoolingImpl::Param::Format::NCHW ||
+                  param.format == PoolingImpl::Param::Format::NCHW44 ||
+                  param.format == PoolingImpl::Param::Format::NCHW88;
+    bool size_ok = param.filter[0] == param.isz[0] && param.filter[1] == param.isz[1] &&
+                   param.padding[0] == 0 && param.padding[1] == 0 &&
+                   param.osz[0] == 1 && param.osz[1] == 1;
+    bool dtype_ok = param.src_type == param.dst_type &&
+                    param.src_type.enumv() != DTypeEnum::Int8;
+    return fmt_ok && size_ok && dtype_ok;
+}
+}  // namespace
+
 size_t PoolingImpl::get_workspace_in_bytes(
         const TensorLayout& src, const TensorLayout& dst) {
     auto param = make_pooling_kern_szie_param(this, src, dst);
+    bool fwd_reduce = is_global_pooling_reduce(param);
+    if (fwd_reduce) {
+        TensorND src_tensor{nullptr, src};
+        TensorND dst_tensor{nullptr, dst};
+        auto reduce_tensor = get_global_pooling_reduce_tensor(src_tensor, dst_tensor);
+        auto&& opr = get_global_pooling_reduce_opr(nullptr, param);
+        auto reduce_need = opr->get_workspace_in_bytes(
+                reduce_tensor.first.layout, reduce_tensor.second.layout);
+        return reduce_need;
+    }
     auto algo = get_algorithm(this, src, dst);
     if (!is_fallback_algo(algo)) {
         size_t arm_common_workspace = 0;
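A worked illustration of the layout trick above (hypothetical shapes, assuming contiguous tensors; not part of the patch):

// For a contiguous NCHW44 input {2, 8, 7, 7, 4} with strides
// {1568, 196, 28, 4, 1}, merge_hw_layout() folds H and W into one axis,
// yielding {2, 8, 49, 4} with strides {1568, 196, 4, 1}. A Reduce along
// axis 2 (MAX, or MEAN for the two AVERAGE modes) then writes the
// {2, 8, 1, 4} global-pooling result directly, with no data movement.
// This is why is_global_pooling_reduce() insists on a full-size window,
// zero padding and a 1x1 output.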
@@ -93,6 +156,18 @@ void PoolingImpl::exec(
         _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
     check_exec(src.layout, dst.layout, workspace.size);
     auto param = make_pooling_kern_param(this, src, dst, workspace);
+    bool fwd_reduce = is_global_pooling_reduce(param);
+    if (fwd_reduce) {
+        auto global_pooling_fwd = [=]() {
+            auto reduce_tensor = get_global_pooling_reduce_tensor(src, dst);
+            auto&& opr = get_global_pooling_reduce_opr(nullptr, param);
+            opr->exec(reduce_tensor.first, reduce_tensor.second, workspace);
+        };
+        MEGDNN_DISPATCH_CPU_KERN_OPR(global_pooling_fwd());
+        return;
+    }
+
     auto algo = get_algorithm(this, src.layout, dst.layout);
     if (!is_fallback_algo(algo)) {
         algo->exec(param);
@@ -8,7 +8,9 @@ param::Pooling AdaptivePoolingBase::deduce_pooling_param(
         const TensorLayout& src, const TensorLayout& dst) {
     auto param_format = param().format;
     size_t IH, IW, OH, OW;
-    if (param_format == param::AdaptivePooling::Format::NCHW) {
+    if (param_format == param::AdaptivePooling::Format::NCHW ||
+        param_format == param::AdaptivePooling::Format::NCHW44 ||
+        param_format == param::AdaptivePooling::Format::NCHW88) {
         IH = src.shape[2];
         IW = src.shape[3];
         OH = dst.shape[2];
@@ -19,7 +21,8 @@ param::Pooling AdaptivePoolingBase::deduce_pooling_param(
         OH = dst.shape[1];
         OW = dst.shape[2];
     } else {
-        megdnn_throw("AdaptivePooling only support NCHW or NHWC format");
+        megdnn_throw(
+                "AdaptivePooling only supports NCHW, NHWC, NCHW44 or NCHW88 format");
    }
 
     param::Pooling ret;
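For reference, a sketch of the window deduction that presumably completes this function (the stride/filter formulas below are an assumption; the diff only shows the shape extraction):

// Assumed adaptive-pooling deduction: OH windows of stride SH tile IH
// exactly, and padding stays zero (likewise for W):
//   ret.stride_h = IH / OH;
//   ret.stride_w = IW / OW;
//   ret.window_h = IH - (OH - 1) * ret.stride_h;
//   ret.window_w = IW - (OW - 1) * ret.stride_w;
// In the global case OH == OW == 1, the window covers the whole input
// plane, which is exactly the condition is_global_pooling_reduce() tests.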
@@ -140,7 +140,9 @@ void PoolingForward::check_exec(
         const TensorLayout& src, const TensorLayout& dst, size_t workspace_in_bytes) {
     check_layout_fwd(src, dst);
     auto required_workspace_in_bytes = get_workspace_in_bytes(src, dst);
-    megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes);
+    megdnn_assert(
+            workspace_in_bytes >= required_workspace_in_bytes, "need %zu, get %zu",
+            required_workspace_in_bytes, workspace_in_bytes);
 }
 
 void PoolingBackward::check_exec(
@@ -6,11 +6,17 @@
 namespace megdnn {
 namespace naive {
 
+size_t AdaptivePoolingForwardImpl::get_workspace_in_bytes(
+        const TensorLayout& src, const TensorLayout& dst) {
+    auto opr = inplace_cpu_handle(2)->create_operator<PoolingForward>();
+    opr->param() = deduce_pooling_param(src, dst);
+    auto need_size = opr->get_workspace_in_bytes(src, dst);
+    return need_size;
+}
+
 void AdaptivePoolingForwardImpl::exec(
         _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
     MEGDNN_DISPATCH_CPU_KERN(static_cast<naive::HandleImpl*>(handle()), {
-        auto opr = inplace_cpu_handle()->create_operator<PoolingForward>();
+        auto opr = inplace_cpu_handle(2)->create_operator<PoolingForward>();
         opr->param() = deduce_pooling_param(src.layout, dst.layout);
         opr->exec(src, dst, workspace);
     });
@@ -20,7 +26,7 @@ void AdaptivePoolingBackwardImpl::exec(
         _megdnn_tensor_in src, _megdnn_tensor_in dst, _megdnn_tensor_in diff,
         _megdnn_tensor_out grad, _megdnn_workspace workspace) {
     MEGDNN_DISPATCH_CPU_KERN(static_cast<naive::HandleImpl*>(handle()), {
-        auto opr = inplace_cpu_handle()->create_operator<PoolingBackward>();
+        auto opr = inplace_cpu_handle(2)->create_operator<PoolingBackward>();
         opr->param() = deduce_pooling_param(src.layout, dst.layout);
         opr->exec(src, dst, diff, grad, workspace);
     });
@@ -29,7 +35,7 @@ void AdaptivePoolingBackwardImpl::exec(
 size_t AdaptivePoolingBackwardImpl::get_workspace_in_bytes(
         const TensorLayout& src, const TensorLayout& dst, const TensorLayout& diff,
         const TensorLayout& grad) {
-    auto opr = inplace_cpu_handle()->create_operator<PoolingBackward>();
+    auto opr = inplace_cpu_handle(2)->create_operator<PoolingBackward>();
     opr->param() = deduce_pooling_param(src, dst);
     return opr->get_workspace_in_bytes(src, dst, diff, grad);
 }
@@ -11,9 +11,7 @@ public:
     void exec(
             _megdnn_tensor_in src, _megdnn_tensor_out dst,
             _megdnn_workspace workspace) override;
-    size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override {
-        return 0;
-    }
+    size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) override;
 };
 
 class AdaptivePoolingBackwardImpl : public AdaptivePoolingBackward {
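Why the naive override stopped returning 0: exec() delegates to a PoolingForward that may itself require workspace, so the old stub understated the requirement, and the strengthened assert above would now fire. A sketch of the failing sequence (sizes hypothetical):

// With the old stub:
//   opr->get_workspace_in_bytes(src, dst);   // returned 0
//   opr->exec(src, dst, {ptr, /*size=*/0});
//   // the delegated PoolingForward::check_exec() now aborts with
//   // "need 4096, get 0" (hypothetical numbers)
// Forwarding the delegated operator's requirement keeps the two in sync.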
@@ -0,0 +1,98 @@
+#include "test/arm_common/fixture.h"
+
+#include "megdnn/tensor_iter.h"
+#include "src/common/utils.h"
+#include "test/common/adaptive_pooling.h"
+#include "test/common/benchmarker.h"
+#include "test/common/checker.h"
+
+namespace megdnn {
+namespace test {
+
+TEST_F(ARM_COMMON, ADAPTIVE_POOLING_FORWARD_NCHW44) {
+    auto args = adaptive_pooling::get_args_nchw44();
+    Checker<AdaptivePooling> checker(handle());
+    checker.set_epsilon(1e-4);
+    for (DType dtype : {(DType)dtype::Float32(), (DType)dtype::QuantizedS8(1.0)})
+        for (auto&& arg : args) {
+            auto param = arg.param;
+            auto src = arg.ishape;
+            auto dst = arg.oshape;
+            checker.set_param(param).set_dtype(0, dtype).set_dtype(1, dtype).exec(
+                    TensorShapeArray{src, dst, {}});
+        }
+}
+
+TEST_F(ARM_COMMON, ADAPTIVE_POOLING_FORWARD) {
+    auto args = adaptive_pooling::get_args();
+    Checker<AdaptivePooling> checker(handle());
+    checker.set_epsilon(1e-4);
+    for (DType dtype : {(DType)dtype::Float32(), (DType)dtype::QuantizedS8(1.0)})
+        for (auto&& arg : args) {
+            auto param = arg.param;
+            auto src = arg.ishape;
+            auto dst = arg.oshape;
+            checker.set_param(param).set_dtype(0, dtype).set_dtype(1, dtype).exec(
+                    TensorShapeArray{src, dst, {}});
+        }
+}
+
+#if MEGDNN_WITH_BENCHMARK
+namespace {
+void benchmark_globalpooling_nchw44_fp32(Handle* handle) {
+    using Param = param::AdaptivePooling;
+    auto run = [&](size_t n, size_t c, size_t h, size_t w, Param::Mode mode) {
+        Param param;
+        param.format = Param::Format::NCHW;
+        param.mode = mode;
+        TensorShape nchw_shape = {n, c, h, w};
+        TensorShape nchw_dst_shape = {n, c, 1, 1};
+        TensorShape nchw44_shape = {n, c / 4, h, w, 4};
+        TensorShape nchw44_dst_shape = {n, c / 4, 1, 1, 4};
+        float calc_amount = n * c * h * w;
+
+        Benchmarker<AdaptivePooling> benchmarker_float_nchw(handle);
+        Benchmarker<AdaptivePooling> benchmarker_float_nchw44(handle);
+        Benchmarker<AdaptivePooling> benchmarker_int_nchw44(handle);
+        size_t RUN = 500;
+        auto t1 = benchmarker_float_nchw.set_display(false)
+                          .set_times(RUN)
+                          .set_param(param)
+                          .exec({nchw_shape, nchw_dst_shape});
+        param.format = Param::Format::NCHW44;
+        auto t2 = benchmarker_int_nchw44.set_display(false)
+                          .set_times(RUN)
+                          .set_param(param)
+                          .execl({{nchw44_shape, dtype::QuantizedS8(1.0)},
+                                  {nchw44_dst_shape, dtype::QuantizedS8(1.0)}});
+        auto t3 = benchmarker_float_nchw44.set_display(false)
+                          .set_times(RUN)
+                          .set_param(param)
+                          .exec({nchw44_shape, nchw44_dst_shape});
+        printf("{%zu %zu %zu %zu} \n"
+               "nchw_fp32={%.3f ms, %.3f Mflops}, "
+               "nchw44_int={%.3f ms, %.3f Mflops}, "
+               "nchw44_fp32={%.3f ms, %.3f Mflops, speed_up %f}\n\n",
+               n, c, h, w, t1 / RUN, calc_amount / (t1 / RUN * 1000), t2 / RUN,
+               calc_amount / (t2 / RUN * 1000), t3 / RUN,
+               calc_amount / (t3 / RUN * 1000), t1 / t3);
+    };
+    run(1, 128, 25, 25, param::AdaptivePooling::Mode::AVERAGE);
+}
+}  // namespace
+
+TEST_F(ARM_COMMON, BENCHMARK_GLOBAL_POOLING_NCHW44_FP32) {
+    benchmark_globalpooling_nchw44_fp32(handle());
+}
+#endif
+
+}  // namespace test
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen
@@ -40,6 +40,36 @@ inline std::vector<TestArg> get_args() {
     return args;
 }
 
+inline std::vector<TestArg> get_args_nchw44() {
+    std::vector<TestArg> args;
+    using Param = param::AdaptivePooling;
+    using Mode = param::AdaptivePooling::Mode;
+
+    for (size_t i = 36; i < 40; ++i) {
+        args.emplace_back(
+                Param{Mode::AVERAGE, Param::Format::NCHW44},
+                TensorShape{2, 3, i, i + 1, 4}, TensorShape{2, 3, i - 4, i - 2, 4});
+        args.emplace_back(
+                Param{Mode::MAX, Param::Format::NCHW44}, TensorShape{2, 3, i, i + 1, 4},
+                TensorShape{2, 3, i - 4, i - 2, 4});
+        args.emplace_back(
+                Param{Mode::AVERAGE, Param::Format::NCHW44},
+                TensorShape{2, 3, i, i + 1, 4}, TensorShape{2, 3, 1, 1, 4});
+        args.emplace_back(
+                Param{Mode::MAX, Param::Format::NCHW44}, TensorShape{2, 3, i, i + 1, 4},
+                TensorShape{2, 3, 1, 1, 4});
+    }
+
+    for (size_t i = 5; i < 10; ++i) {
+        args.emplace_back(
+                Param{Mode::AVERAGE, Param::Format::NCHW44},
+                TensorShape{2, 3, i, i + 1, 4}, TensorShape{2, 3, i - 3, i - 2, 4});
+        args.emplace_back(
+                Param{Mode::MAX, Param::Format::NCHW44}, TensorShape{2, 3, i, i + 1, 4},
+                TensorShape{2, 3, i - 3, i - 2, 4});
+    }
+    return args;
+}
+
 }  // namespace adaptive_pooling
 }  // namespace test
 }  // namespace megdnn
@@ -254,7 +254,9 @@ def optimize_for_inference(dest_vars, **kwargs):
         * enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z
           input for inference on nvidia backend(this optimization pass will
           result in mismatch of the precision of output of training and
           inference)
+        * enable_fuse_grain: enabled by default; fuses fine-grained
+          operators into larger fused operators. Pass
+          enable_fuse_grain=False to disable it.
     """
 
     inference_options = GraphOptimizeOptions()
     inference_optimize_layout_transform_map = {
@@ -282,6 +284,8 @@ def optimize_for_inference(dest_vars, **kwargs):
         inference_options.fuse_conv_bias_with_z = True
     if kwargs.pop("enable_fuse_preprocess", False):
         inference_options.fuse_preprocess = True
+    if kwargs.pop("enable_fuse_grain", True):
+        inference_options.fuse_grain = True
 
     if kwargs:
         raise ValueError("unknown options: %s" % list(kwargs))
@@ -330,6 +334,8 @@ def deserialize_infer_option(x: int) -> Dict[str, bool]:
         ret["enable_fuse_conv_bias_with_z"] = True
     if inference_options.fuse_preprocess:
         ret["enable_fuse_preprocess"] = True
+    if inference_options.fuse_grain:
+        ret["enable_fuse_grain"] = True
 
     return ret
@@ -151,7 +151,9 @@ class Network:
         * enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z
           input for inference on nvidia backend(this optimization pass will
           result in mismatch of the precision of output of training and
           inference)
+        * enable_fuse_grain: enabled by default; fuses fine-grained
+          operators into larger fused operators. Pass
+          enable_fuse_grain=False to disable it.
     """
 
     if not isinstance(dest_vars, Sequence):
@@ -221,7 +223,6 @@
             logger.warning(
                 '"output_names" is not supported in Network.dump, rename output vars directly'
             )
-
         if optimize_for_inference:
             out, optimize_options = G.optimize_for_inference(out, **kwargs)
@@ -292,7 +292,9 @@ void init_graph_rt(py::module m) {
                     &_OptimizeForInferenceOptions::fuse_preprocess)
             .def_readwrite(
                     "layout_transform",
-                    &_OptimizeForInferenceOptions::layout_transform);
+                    &_OptimizeForInferenceOptions::layout_transform)
+            .def_readwrite(
+                    "fuse_grain", &_OptimizeForInferenceOptions::fuse_grain);
 
     py::enum_<_LayoutTransform>(GraphOptimizeOptions, "LayoutTransform")
             .value("DEFAULT", _LayoutTransform::DEFAULT)
@@ -47,6 +47,7 @@ def test_metadata():
         "user_info": {"str": "x", "tensor": x, "module": M.Module, "none": None},
         "graph_modified": True,  # True: Network.dump
         "optimized_for_inference": True,
+        "enable_fuse_grain": True,
         "enable_nchw4": True,
         "enable_ioc16": True,
     }
@@ -76,6 +76,13 @@ void ModelMdl::make_output_spec() {
     }
     m_asyc_exec = m_load_result.graph_compile(m_output_spec);
+
+    auto new_output_vars = m_asyc_exec->get_output_vars();
+    mgb::cg::SymbolVarArray symbol_var_array;
+    symbol_var_array.reserve(new_output_vars.size());
+    for (auto output_var : new_output_vars) {
+        symbol_var_array.emplace_back(output_var);
+    }
+    m_load_result.output_var_list = symbol_var_array;
 }
 
 std::shared_ptr<mgb::serialization::GraphLoader>& ModelMdl::reset_loader(
@@ -56,6 +56,43 @@ void PackModelOption::config_model(
         RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) {
     CONFIG_MODEL_FUN;
 }
+
+///////////////////// RawModelOption //////////////////////////
+std::shared_ptr<OptionBase> RawModelOption::create_option() {
+    static std::shared_ptr<RawModelOption> option(new RawModelOption);
+    if (RawModelOption::is_valid()) {
+        return std::static_pointer_cast<OptionBase>(option);
+    } else {
+        return nullptr;
+    }
+}
+
+RawModelOption::RawModelOption() {
+    m_option_name = "raw_model";
+    if (!FLAGS_model_dump.empty())
+        model_dump = FLAGS_model_dump;
+}
+
+bool RawModelOption::is_valid() {
+    return !FLAGS_model_dump.empty();
+}
+
+void RawModelOption::config_model(
+        RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) {
+    CONFIG_MODEL_FUN;
+}
+
+template <typename ModelImpl>
+void RawModelOption::config_model_internel(
+        RuntimeParam& runtime_param, std::shared_ptr<ModelImpl> model) {
+    if (runtime_param.stage == RunStage::AFTER_MODEL_RUNNING) {
+        auto model_data = model->get_model_data();
+        std::ofstream ofs(model_dump, std::ios::binary);
+        if (!ofs.is_open()) {
+            mgb_log_warn("cannot open file %s to write model\n", model_dump.c_str());
+            return;
+        }
+        ofs.write((char*)model_data.data(), model_data.size());
+        ofs.close();
+        mgb_log_warn("successfully wrote model to %s\n", model_dump.c_str());
+    }
+}
+
 ////////////////////// PackModel gflags ////////////////////////
@@ -79,4 +116,8 @@ DEFINE_string(
         "https://megengine.megvii-inc.com/user-guide/deployment/lite/advance/"
         "pack-lite-model.html for more details.");
-REGIST_OPTION_CREATOR(pack_model, lar::PackModelOption::create_option);
+
+///////////////////// RawModel gflags ///////////////////////////
+DEFINE_string(model_dump, "", "The output file path of raw model.");
+
+REGIST_OPTION_CREATOR(pack_model, lar::PackModelOption::create_option);
+REGIST_OPTION_CREATOR(dump_model, lar::RawModelOption::create_option);
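A usage sketch for the new flag (hypothetical invocation; the binary and file names are illustrative):

// load_and_run ./model.mge --model_dump ./raw_model.mge
//
// At RunStage::AFTER_MODEL_RUNNING the option writes the raw model bytes
// returned by get_model_data() to the given path.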
@@ -3,7 +3,7 @@
 #include "megbrain/graph/operator_node.h"
 #include "models/model.h"
 #include "option_base.h"
+DECLARE_string(model_dump);
 DECLARE_string(packed_model_dump);
 DECLARE_string(pack_info_json);
 DECLARE_string(pack_cache);
@@ -36,4 +36,22 @@ private:
     std::string pack_model_cryption;
     bool is_fast_run_cache = true;
 };
+
+class RawModelOption : public OptionBase {
+public:
+    static bool is_valid();
+    static std::shared_ptr<OptionBase> create_option();
+    void config_model(
+            RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
+    std::string option_name() const override { return m_option_name; }
+
+private:
+    RawModelOption();
+
+    template <typename ModelImpl>
+    void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>);
+
+    std::string m_option_name;
+    std::string model_dump;
+};
 }  // namespace lar
@@ -124,6 +124,60 @@ void WeightPreprocessOption::config_model(
     CONFIG_MODEL_FUN;
 }
+
+///////////////////////// fuse grain optimize options ///////////////
+bool FuseGrainOption::m_valid;
+namespace lar {
+template <>
+void FuseGrainOption::config_model_internel<ModelLite>(
+        RuntimeParam& runtime_param, std::shared_ptr<ModelLite>) {
+    if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
+        if (m_fuse_grain) {
+            LITE_THROW("the fuse-grain optimization is not supported in lite models");
+        }
+    }
+}
+
+template <>
+void FuseGrainOption::config_model_internel<ModelMdl>(
+        RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) {
+    if (runtime_param.stage == RunStage::BEFORE_MODEL_LOAD) {
+        auto&& graph_option = model->get_mdl_config().comp_graph->options();
+        if (m_fuse_grain) {
+            mgb_log_warn("enable fuse-grain optimization");
+            graph_option.graph_opt.enable_fuse_grain();
+        }
+    }
+}
+}  // namespace lar
+
+FuseGrainOption::FuseGrainOption() {
+    m_option_name = "fuse_grain";
+    m_fuse_grain = FLAGS_fuse_grain;
+    m_option = {{"fuse_grain", lar::Bool::make(false)}};
+    std::static_pointer_cast<lar::Bool>(m_option["fuse_grain"])
+            ->set_value(FLAGS_fuse_grain);
+}
+
+bool FuseGrainOption::is_valid() {
+    return true;
+}
+
+std::shared_ptr<OptionBase> FuseGrainOption::create_option() {
+    static std::shared_ptr<FuseGrainOption> option(new FuseGrainOption);
+    if (FuseGrainOption::is_valid()) {
+        return std::static_pointer_cast<OptionBase>(option);
+    } else {
+        return nullptr;
+    }
+}
+
+void FuseGrainOption::config_model(
+        RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) {
+    m_fuse_grain =
+            std::static_pointer_cast<lar::Bool>(m_option["fuse_grain"])->get_value();
+    CONFIG_MODEL_FUN;
+}
+
 ///// fuse conv bias and nonlinear activation opr optimize options ////////
 bool FuseConvBiasNonlinearOption::m_valid;
 namespace lar {
@@ -579,6 +633,7 @@ void TensorRTOption::config_model(
 DEFINE_bool(
         enable_fuse_preprocess, false,
         "Fusion astype | pad_channel | dimshuffle and etc opr from h2d opr");
+DEFINE_bool(fuse_grain, false, "Enable fusing fine-grained oprs into huge oprs");
 DEFINE_bool(
         weight_preprocess, false,
         "Execute operators with weight preprocess, which can optimize the "
@@ -589,7 +644,7 @@ DEFINE_bool(
         "whether to fuse conv+bias+nonlinearity");
 DEFINE_bool(
         enable_fuse_conv_bias_with_z, false,
-        "fuse conv,bias (elemwise add),z(elemwise add) into one opr "
+        "fuse conv, bias (elemwise add), z(elemwise add) into one opr "
         "(only support on GPU)");
 
 ///////////////////////// graph restrict options /////////////////////////
@@ -636,6 +691,9 @@ REGIST_OPTION_VALIDATER(fuse_preprocess, lar::FusePreprocessOption::set_valid);
 REGIST_OPTION_CREATOR(weight_preprocess, lar::WeightPreprocessOption::create_option);
 REGIST_OPTION_VALIDATER(weight_preprocess, lar::WeightPreprocessOption::set_valid);
 
+REGIST_OPTION_CREATOR(disable_fuse_grain, lar::FuseGrainOption::create_option);
+REGIST_OPTION_VALIDATER(disable_fuse_grain, lar::FuseGrainOption::set_valid);
+
 REGIST_OPTION_CREATOR(
         fuse_conv_bias_nonlinearity, lar::FuseConvBiasNonlinearOption::create_option);
 REGIST_OPTION_VALIDATER(
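And for fuse_grain (hypothetical invocation; binary name illustrative):

// load_and_run ./model.mge --fuse_grain
//
// At RunStage::BEFORE_MODEL_LOAD this calls graph_opt.enable_fuse_grain(),
// which (per the GraphOptimizer hunk further below) schedules
// FoldingReduceMeanPass and FoldingGlobalPoolingPass; lite models reject it.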
@@ -5,6 +5,7 @@
 #include "option_base.h"
 
 DECLARE_bool(enable_fuse_preprocess);
+DECLARE_bool(fuse_grain);
 DECLARE_bool(weight_preprocess);
 DECLARE_bool(enable_fuse_conv_bias_nonlinearity);
 DECLARE_bool(enable_fuse_conv_bias_with_z);
@@ -79,7 +80,31 @@ private:
     static bool m_valid;
     OptionValMap m_option;
 };
+
+///////////////////////// fuse grain options //////////////
+class FuseGrainOption final : public OptionBase {
+public:
+    static bool is_valid();
+    static std::shared_ptr<OptionBase> create_option();
+    void config_model(
+            RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) override;
+    std::string option_name() const override { return m_option_name; }
+    static void set_valid(bool val) { m_valid = val; }
+    OptionValMap* get_option() override { return &m_option; }
+
+private:
+    FuseGrainOption();
+
+    template <typename ModelImpl>
+    void config_model_internel(RuntimeParam&, std::shared_ptr<ModelImpl>) {}
+
+    std::string m_option_name;
+    bool m_fuse_grain;
+    static bool m_valid;
+    OptionValMap m_option;
+};
+
 /////////////// fuse_conv_bias_nonlinearity optimize options ///////////////
 class FuseConvBiasNonlinearOption final : public OptionBase {
 public:
@@ -91,7 +91,7 @@ public:
 };
 
 class OutputVarsUserData final : public mgb::UserDataContainer::UserData {
-    MGB_TYPEINFO_OBJ_DECL;
+    MGB_TYPEINFO_OBJ_DECL_WITH_EXPORT;
 
 private:
     VarNodeArray m_output_vars;
@@ -91,6 +91,9 @@ struct GraphCommonOptimizeOptions {
     bool weight_preprocess = false;
     //! fuse preprocess pattern, like astype + pad_channel + dimshuffle
     bool fuse_preprocess = false;
+    //! fuse_grain pattern: replace fine-grained IR with fused, coarse-grained IR
+    bool fuse_grain = false;
     enum LayoutTransform : uint32_t {
         DEFAULT,
         NCHW4,  ///< compute using NCHW4 tensor format
@@ -124,6 +127,7 @@ struct GraphCommonOptimizeOptions {
     SET(fuse_conv_bias_with_z);
     SET(fuse_preprocess);
     SET(weight_preprocess);
+    SET(fuse_grain);
 #undef SET
 #define SET(_trans, _trans_capital) \
     GraphCommonOptimizeOptions& enable_##_trans() { \
@@ -0,0 +1,180 @@
+#include "megbrain/gopt/inference.h"
+#include "megbrain/opr/basic_arith.h"
+#include "megbrain/opr/dnn/adaptive_pooling.h"
+#include "megbrain/opr/tensor_manip.h"
+#include "megbrain/opr/utility.h"
+#include "megbrain/serialization/opr_shallow_copy.h"
+#include "megdnn/opr_param_defs.h"
+
+#include "megbrain/opr/internal/megdnn_opr_wrapper.h"
+
+#include "../../core/impl/graph/cg_impl.h"
+#include "./gopt_helper.h"
+
+#include "megbrain/utils/hash_ct.h"
+#include "midout.h"
+
+MIDOUT_DECL(megbrain_folding_global_pooling)
+#define MIDOUT_B(tag) \
+    MIDOUT_BEGIN(megbrain_folding_global_pooling, midout_iv(MGB_HASH_STR(tag))) {
+#define MIDOUT_E \
+    }            \
+    MIDOUT_END();
+
+using namespace mgb;
+using namespace gopt;
+
+/* ==================== FoldingGlobalPoolingPass ================= */
+const char* FoldingGlobalPoolingPass::name() const {
+    return mgb_cstr_log("folding global pooling pass");
+}
+
+void FoldingGlobalPoolingPass::apply(OptState& opt) const {
+    MIDOUT_B("FoldingGlobalPoolingPass::apply");
+    FindNext find_tool(opt);
+    auto rewriter = opt.graph().make_rewriter();
+
+    /**
+     * reshape+------>reduce(mean or max)+--->axis_add_remove*n
+     *                                             ||
+     *                                             ||
+     *                                             ||
+     *                                             \/
+     *                                   adaptive_pooling(1,1)
+     */
+    auto try_fuse_global_pooling_axis_add = [&rewriter,
+                                             &find_tool](OperatorNodeBase* opr) {
+        MGB_MARK_USED_VAR(rewriter);
+        MGB_MARK_USED_VAR(find_tool);
+        auto axis_modi = try_cast_as_op<opr::AxisAddRemove>(opr);
+        CHECK_OR_RETURN(axis_modi);
+        CHECK_OR_RETURN(find_tool.used_count(axis_modi) <= 1);
+        auto output_shape = axis_modi->output(0)->shape();
+        CHECK_OR_RETURN(output_shape.ndim == 4);
+        CHECK_OR_RETURN(output_shape[2] == output_shape[3] && output_shape[2] == 1);
+        auto axis_input = axis_modi->input(0)->owner_opr();
+        auto axis_modi_x = axis_input->try_cast_final<opr::AxisAddRemove>();
+        auto reduce = axis_input->try_cast_final<opr::Reduce>();
+        while (axis_modi_x) {
+            CHECK_OR_RETURN(find_tool.used_count(axis_modi_x) == 1);
+            auto axis_input_x = axis_modi_x->input(0)->owner_opr();
+            reduce = axis_input_x->try_cast_final<opr::Reduce>();
+            axis_modi_x = axis_input_x->try_cast_final<opr::AxisAddRemove>();
+        }
+        CHECK_OR_RETURN(reduce);
+        auto reduce_mode = reduce->param().mode;
+        CHECK_OR_RETURN(
+                reduce_mode == opr::Reduce::Param::Mode::MAX ||
+                reduce_mode == opr::Reduce::Param::Mode::MEAN);
+        auto reduce_axis = reduce->param().axis;
+        CHECK_OR_RETURN(reduce_axis == 2);
+        auto reshape = reduce->input(0)->owner_opr()->try_cast_final<opr::Reshape>();
+        CHECK_OR_RETURN(reshape);
+        auto reshape_in_shape = reshape->input(0)->shape();
+        auto reshape_out_shape = reshape->output(0)->shape();
+        bool merge_hw =
+                reshape_out_shape.ndim == 3 && reshape_in_shape.ndim == 4 &&
+                reshape_in_shape[2] * reshape_in_shape[3] == reshape_out_shape[2];
+        CHECK_OR_RETURN(merge_hw);
+
+        opr::AdaptivePooling::Param param;
+        if (reduce_mode == opr::Reduce::Param::Mode::MAX) {
+            param.mode = opr::AdaptivePooling::Param::Mode::MAX;
+        } else {
+            mgb_assert(reduce_mode == opr::Reduce::Param::Mode::MEAN);
+            param.mode = opr::AdaptivePooling::Param::Mode::AVERAGE;
+        }
+        auto new_node = opr::AdaptivePooling::make(
+                rewriter.get_var(reshape->input(0)), {1, 1}, param);
+        rewriter.replace_var(
+                axis_modi->output(0), new_node.node(),
+                mgb_cstr_log("replace reshape+reduce+add_axis -> adaptive pooling"));
+        return true;
+    };
+
+    /**
+     * reshape+------>reduce(mean or max)+--->dimshuffle(0,1,-1,-1)
+     *                                             ||
+     *                                             ||
+     *                                             ||
+     *                                             \/
+     *                                   adaptive_pooling(1,1)
+     */
+    auto try_fuse_global_pooling_dimshuffle = [&rewriter,
+                                               &find_tool](OperatorNodeBase* opr) {
+        MGB_MARK_USED_VAR(rewriter);
+        MGB_MARK_USED_VAR(find_tool);
+        auto dimshuffle = try_cast_as_op<opr::Dimshuffle>(opr);
+        CHECK_OR_RETURN(dimshuffle);
+        auto pattern_param = dimshuffle->param();
+        CHECK_OR_RETURN(pattern_param.pattern_len == 4);
+        auto pattern = pattern_param.pattern;
+        CHECK_OR_RETURN(
+                pattern[0] == 0 && pattern[1] == 1 && pattern[2] == -1 &&
+                pattern[3] == -1);
+        auto axis_remove =
+                dimshuffle->input(0)->owner_opr()->try_cast_final<opr::AxisAddRemove>();
+        CHECK_OR_RETURN(axis_remove);
+        auto reduce = axis_remove->input(0)->owner_opr()->try_cast_final<opr::Reduce>();
+        CHECK_OR_RETURN(reduce);
+        auto reduce_mode = reduce->param().mode;
+        CHECK_OR_RETURN(
+                reduce_mode == opr::Reduce::Param::Mode::MAX ||
+                reduce_mode == opr::Reduce::Param::Mode::MEAN);
+        auto reduce_axis = reduce->param().axis;
+        CHECK_OR_RETURN(reduce_axis == 2);
+        auto reshape = reduce->input(0)->owner_opr()->try_cast_final<opr::Reshape>();
+        CHECK_OR_RETURN(reshape);
+        auto reshape_in_shape = reshape->input(0)->shape();
+        auto reshape_out_shape = reshape->output(0)->shape();
+        bool merge_hw =
+                reshape_out_shape.ndim == 3 && reshape_in_shape.ndim == 4 &&
+                reshape_in_shape[2] * reshape_in_shape[3] == reshape_out_shape[2];
+        CHECK_OR_RETURN(merge_hw);
+
+        opr::AdaptivePooling::Param param;
+        if (reduce_mode == opr::Reduce::Param::Mode::MAX) {
+            param.mode = opr::AdaptivePooling::Param::Mode::MAX;
+        } else {
+            mgb_assert(reduce_mode == opr::Reduce::Param::Mode::MEAN);
+            param.mode = opr::AdaptivePooling::Param::Mode::AVERAGE;
+        }
+        auto new_node = opr::AdaptivePooling::make(
+                rewriter.get_var(reshape->input(0)), {1, 1}, param);
+        rewriter.replace_var(
+                dimshuffle->output(0), new_node.node(),
+                mgb_cstr_log("replace reshape+reduce+dimshuffle -> adaptive pooling"));
+        return true;
+    };
+
+    auto on_opr = [&try_fuse_global_pooling_axis_add,
+                   &try_fuse_global_pooling_dimshuffle,
+                   &rewriter](OperatorNodeBase* opr) {
+        if (!try_fuse_global_pooling_axis_add(opr) &&
+            !try_fuse_global_pooling_dimshuffle(opr)) {
+            rewriter.auto_replace_outputs(opr);
+        }
+    };
+    opt.graph().iter(on_opr);
+    rewriter.apply_inplace();
+    MIDOUT_E
+}
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
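A hedged before/after sketch of the first pattern, written against the same opr API the pass matches (shapes and names illustrative):

// Before (as emitted by older models), for x of shape {N, C, H, W}:
//   reshape1 = opr::Reshape::make(x, {N, C, H * W});
//   reduce   = opr::Reduce::make(reshape1, {Mode::MEAN, /*axis=*/2});  // {N, C, 1}
//   y        = opr::AxisAddRemove::make(reduce, ...);                  // {N, C, 1, 1}
// After this pass:
//   y = opr::AdaptivePooling::make(x, {1, 1}, param);  // mode AVERAGE
// MAX reduces map to AdaptivePooling MAX, and the dimshuffle(0,1,-1,-1)
// variant folds the same way.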
@@ -0,0 +1,117 @@
+#include "megbrain/gopt/inference.h"
+#include "megbrain/opr/basic_arith.h"
+#include "megbrain/opr/dnn/convolution.h"
+#include "megbrain/opr/tensor_manip.h"
+#include "megbrain/opr/utility.h"
+#include "megbrain/serialization/opr_shallow_copy.h"
+#include "megdnn/opr_param_defs.h"
+
+#include "megbrain/opr/internal/megdnn_opr_wrapper.h"
+
+#include "../../core/impl/graph/cg_impl.h"
+#include "./gopt_helper.h"
+
+#include "megbrain/utils/hash_ct.h"
+#include "midout.h"
+
+MIDOUT_DECL(megbrain_folding_reduce_mean)
+#define MIDOUT_B(tag) \
+    MIDOUT_BEGIN(megbrain_folding_reduce_mean, midout_iv(MGB_HASH_STR(tag))) {
+#define MIDOUT_E \
+    }            \
+    MIDOUT_END();
+
+using namespace mgb;
+using namespace gopt;
+
+/* ==================== FoldingReduceMeanPass ================= */
+const char* FoldingReduceMeanPass::name() const {
+    return mgb_cstr_log("folding reduce mean pass");
+}
+
+void FoldingReduceMeanPass::apply(OptState& opt) const {
+    MIDOUT_B("FoldingReduceMeanPass::apply");
+    FindNext find_tool(opt);
+    auto rewriter = opt.graph().make_rewriter();
+
+    /**
+     * reshape+---------->reduce(axis, sum)+--------->axis_remove+----------->true_div
+     *    |                                                                      ^
+     *    |                                                                      |
+     *    +--------------> get_var_shape(axis)+------------>type_cvt(fp32)+-----+
+     *                                       ||
+     *                                       ||
+     *                                       \/
+     *          reshape+-------->reduce(axis, mean)+--------->axis_remove
+     **/
+    auto try_fuse_reduce_mean = [&rewriter, &find_tool](OperatorNodeBase* opr) {
+        MGB_MARK_USED_VAR(rewriter);
+        // check true_div
+        auto elemwise = try_cast_as_op<opr::Elemwise>(opr);
+        CHECK_OR_RETURN(elemwise);
+        auto mode_ok = elemwise->param().mode == opr::Elemwise::Mode::TRUE_DIV;
+        CHECK_OR_RETURN(mode_ok);
+        auto input0 = elemwise->input(0)->owner_opr();
+        auto remove_axis = input0->try_cast_final<opr::AxisAddRemove>();
+        auto reduce = input0->try_cast_final<opr::Reduce>();
+        if (remove_axis) {
+            reduce = remove_axis->input(0)->owner_opr()->try_cast_final<opr::Reduce>();
+        }
+        CHECK_OR_RETURN(reduce);
+        bool reduce_sum = reduce->param().mode == opr::Reduce::Param::Mode::SUM;
+        CHECK_OR_RETURN(reduce_sum);
+        auto input1 = elemwise->input(1)->owner_opr();
+        auto typecvt = input1->try_cast_final<opr::TypeCvt>();
+        CHECK_OR_RETURN(typecvt);
+        auto is_typecvt_f32 = typecvt->param().enumv() == DTypeEnum::Float32;
+        CHECK_OR_RETURN(is_typecvt_f32);
+        auto get_var_shape =
+                typecvt->input(0)->owner_opr()->try_cast_final<opr::GetVarShape>();
+        CHECK_OR_RETURN(get_var_shape);
+        bool same_parent =
+                get_var_shape->input(0)->owner_opr() == reduce->input(0)->owner_opr();
+        CHECK_OR_RETURN(same_parent);
+        CHECK_OR_RETURN(
+                find_tool.used_count(get_var_shape->input(0)->owner_opr()) == 2);
+        bool same_axis = get_var_shape->param().axis == reduce->param().axis;
+        CHECK_OR_RETURN(same_axis);
+
+        auto new_reduce_param = reduce->param();
+        new_reduce_param.mode = opr::Reduce::Mode::MEAN;
+        auto new_node =
+                opr::Reduce::make(rewriter.get_var(reduce->input(0)), new_reduce_param);
+        if (remove_axis) {
+            new_node = opr::AxisAddRemove::make(
+                    new_node, remove_axis->param(), remove_axis->config());
+        }
+        rewriter.replace_var(
+                opr->output(0), new_node.node(),
+                mgb_cstr_log("replace reduce_sum+div_axis -> reduce_mean"));
+        return true;
+    };
+
+    auto on_opr = [&try_fuse_reduce_mean, &rewriter](OperatorNodeBase* opr) {
+        if (!try_fuse_reduce_mean(opr)) {
+            rewriter.auto_replace_outputs(opr);
+        }
+    };
+    opt.graph().iter(on_opr);
+    rewriter.apply_inplace();
+    MIDOUT_E
+}
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
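The matched pattern and its replacement as a comment sketch (t stands for the merged-axis reshape output; names illustrative):

// Before, as older MegBrain emitted mean:
//   sum = opr::Reduce::make(t, {Mode::SUM, /*axis=*/2});
//   cnt = type_cvt(get_var_shape(t, /*axis=*/2), Float32);
//   y   = remove_axis(sum) / cnt;   // Elemwise TRUE_DIV
// After this pass:
//   y   = remove_axis(opr::Reduce::make(t, {Mode::MEAN, /*axis=*/2}));
// The used_count(...) == 2 guard checks that t feeds only the Reduce and
// the GetVarShape, so no other consumer observes the folded intermediates.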
@@ -722,7 +722,10 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
         options.disable_##_option(); \
     }                                \
     }
+    cb(fuse_grain, {
+        add_pass<FoldingReduceMeanPass>();
+        add_pass<FoldingGlobalPoolingPass>();
+    });
 
     cb(fuse_preprocess, {
         add_pass(FuseNCHW4Int8Preprocess::make());
         add_pass<FuseWarpPerspectiveDimshufflePass>();
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "../../core/impl/graph/cg_impl.h"
+#include "megbrain/gopt/inference.h"
+#include "megbrain/utils/hash_ct.h"
+
+namespace mgb {
+namespace gopt {
+namespace {
+
+#define CHECK_OR_RETURN(x) \
+    if (!(x)) {            \
+        return false;      \
+    }
+
+class FindNext {
+    using DepType = cg::OperatorNodeProp::DepType;
+
+public:
+    FindNext(OptState& opt) {
+        opt.graph().iter([&](OperatorNodeBase* opr) {
+            for (auto&& i : opr->node_prop().dep_map()) {
+                m_readers[i.first->owner_opr()].emplace_back(opr, i.second);
+            }
+        });
+    }
+
+    size_t used_count(OperatorNodeBase* opr) { return m_readers[opr].size(); }
+
+private:
+    ThinHashMap<OperatorNodeBase*, SmallVector<std::pair<OperatorNodeBase*, DepType>>>
+            m_readers;
+};
+
+}  // namespace
+}  // namespace gopt
+}  // namespace mgb
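For orientation, how the passes above consume these helpers (a sketch mirroring the matcher lambdas):

// FindNext find_tool(opt);                   // index: operator -> readers
// auto matcher = [&](OperatorNodeBase* opr) {
//     auto reduce = try_cast_as_op<opr::Reduce>(opr);
//     CHECK_OR_RETURN(reduce);               // expands to: if (!(x)) return false;
//     CHECK_OR_RETURN(find_tool.used_count(reduce) == 1);  // sole reader,
//                                            // so folding cannot break other users
//     /* ...rewrite and return true... */
// };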
@@ -4,6 +4,7 @@
 #include "megbrain/graph/event.h"
 #include "megbrain/opr/basic_arith.h"
 #include "megbrain/opr/blas.h"
+#include "megbrain/opr/dnn/adaptive_pooling.h"
 #include "megbrain/opr/dnn/batch_norm.h"
 #include "megbrain/opr/dnn/convolution.h"
 #include "megbrain/opr/dnn/local.h"
@@ -1368,6 +1369,8 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) {
             megdnn::param::Convolution::Format::NCHW88;
     megdnn::param::Pooling::Format pooling_format =
             megdnn::param::Pooling::Format::NCHW88;
+    megdnn::param::AdaptivePooling::Format adapt_pooling_format =
+            megdnn::param::AdaptivePooling::Format::NCHW88;
     megdnn::param::Resize::Format resize_format = megdnn::param::Resize::Format::NCHW88;
     std::string convter_pass_name = "conv_format_nchw88";
 
@@ -1381,6 +1384,7 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) {
         conv_bias_format = megdnn::param::ConvBias::Format::NCHW44;
         conv_format = megdnn::param::Convolution::Format::NCHW44;
         pooling_format = megdnn::param::Pooling::Format::NCHW44;
+        adapt_pooling_format = megdnn::param::AdaptivePooling::Format::NCHW44;
         resize_format = megdnn::param::Resize::Format::NCHW44;
         convter_pass_name = "conv_format_nchw44";
     }
@@ -1646,6 +1650,33 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) {
             return new_opr;
         }
     };
+    auto replace_adapt_pooling_opr = [=](OperatorNodeBase* opr,
+                                         const VarNodeArray& new_inp) {
+        mgb_assert(opr->input().size() == new_inp.size());
+        auto& pooling_opr = opr->cast_final_safe<opr::AdaptivePooling>();
+        mgb_throw_if(
+                pooling_opr.param().format !=
+                        opr::AdaptivePoolingForward::Param::Format::NCHW,
+                MegBrainError,
+                "ConvertFormat Pass only support converting NCHW to NCHWxx");
+        VarNode* inp_0 = new_inp[0];
+        VarNode* inp_1 = new_inp[1];
+        //! if input is nchwxx
+        if (inp_0->shape().ndim == 5) {
+            auto new_param = pooling_opr.param();
+            new_param.format = adapt_pooling_format;
+            auto new_pooling_opr = opr::AdaptivePoolingForward::make(
+                    inp_0, inp_1, new_param, opr->config());
+            mgb_assert(
+                    new_pooling_opr.shape().ndim == 5,
+                    "The pooling dst dim is not trans to nchwxx");
+            return new_pooling_opr.node()->owner_opr();
+        } else {
+            auto new_opr =
+                    serialization::copy_opr_shallow(*opr, new_inp, opr->config());
+            return new_opr;
+        }
+    };
+
     auto replace_resize_opr = [=](OperatorNodeBase* opr, const VarNodeArray& new_inp) {
         mgb_assert(opr->input().size() == new_inp.size());
@@ -1763,6 +1794,7 @@ void EnableNchwxxPass::fill_opr_convert_fun(size_t pack_c_size) {
     replace_func[opr::Convolution::typeinfo()] = replace_conv_opr;
     replace_func[opr::ConvBias::typeinfo()] = replace_conv_bias_opr;
     replace_func[opr::PoolingForward::typeinfo()] = replace_pooling_opr;
+    replace_func[opr::AdaptivePooling::typeinfo()] = replace_adapt_pooling_opr;
     replace_func[opr::ResizeForward::typeinfo()] = replace_resize_opr;
     replace_func[opr::Concat::typeinfo()] = replace_multi_inp_opr;
     replace_func[opr::Elemwise::typeinfo()] = replace_multi_inp_opr;
@@ -327,6 +327,8 @@ struct OptimizeForInferenceOptions : cg::GraphCommonOptimizeOptions {
         ret |= 1u << 4;
     if (fuse_preprocess)
         ret |= 1u << 5;
+    if (fuse_grain)
+        ret |= 1u << 6;
     return ret;
 }
@@ -338,6 +340,7 @@ struct OptimizeForInferenceOptions : cg::GraphCommonOptimizeOptions {
     ret.fuse_conv_bias_with_z = buf & 1u << 3;
     ret.weight_preprocess = buf & 1u << 4;
     ret.fuse_preprocess = buf & 1u << 5;
+    ret.fuse_grain = buf & 1u << 6;
     ret.layout_transform = (LayoutTransform)(buf >> 32);
     return ret;
 }
@@ -477,6 +480,25 @@ public:
     void apply(OptState& opt) const override;
 };
 #endif
+
+/*!
+ * \brief older MegBrain expressed reduce_mean as reduce_sum followed by a
+ *        division; fold the pair back into a single MEAN reduce for efficiency
+ */
+class FoldingReduceMeanPass final : public Pass {
+public:
+    const char* name() const override;
+    void apply(OptState& opt) const override;
+};
+
+/*!
+ * \brief fold a reduce over the merged H*W axis into global adaptive pooling,
+ *        so that the NCHWxx layout optimizations can apply
+ */
+class FoldingGlobalPoolingPass final : public Pass {
+public:
+    const char* name() const override;
+    void apply(OptState& opt) const override;
+};
 
 /*!
  * \brief padding channel to enable fast int8/int4 support
| @@ -0,0 +1,249 @@ | |||||
| #include "megbrain/opr/dnn/local.h" | |||||
| #include "megbrain/test/helper.h" | |||||
| #include "megbrain/gopt/basic_arith.h" | |||||
| #include "megbrain/gopt/gtrans.h" | |||||
| #include "megbrain/gopt/inference.h" | |||||
| #include "megbrain/opr/basic_arith_wrapper.h" | |||||
| #include "megbrain/opr/blas.h" | |||||
| #include "megbrain/opr/dnn/adaptive_pooling.h" | |||||
| #include "megbrain/opr/io.h" | |||||
| #include "megbrain/opr/nn_int.h" | |||||
| #include "megbrain/opr/tensor_gen.h" | |||||
| #include "megbrain/opr/tensor_manip.h" | |||||
| #include "megbrain/opr/utility.h" | |||||
| #include "./helper.h" | |||||
| #include "megbrain/comp_node_env.h" | |||||
| #include "megdnn/tensor_format.h" | |||||
| #include <random> | |||||
| #include <vector> | |||||
| using namespace mgb; | |||||
| namespace { | |||||
| //! find the first operator of a specific type; raise an exception if not found | |||||
| template <typename T> | |||||
| T& find_opr(SymbolVar endpoint) { | |||||
| T* found = nullptr; | |||||
| auto cb = [&found](cg::OperatorNodeBase* opr) { | |||||
| if (!found && opr->same_type<T>()) { | |||||
| found = &opr->cast_final_safe<T>(); | |||||
| } | |||||
| }; | |||||
| cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); | |||||
| mgb_assert(found, "not found opr from %s", endpoint.node()->name().c_str()); | |||||
| return *found; | |||||
| } | |||||
| template <typename T> | |||||
| T& find_opr(SymbolVar endpoint, const std::string& node_name) { | |||||
| T* found = nullptr; | |||||
| auto cb = [&found, &node_name](cg::OperatorNodeBase* opr) { | |||||
| if (!found && opr->same_type<T>() && opr->name() == node_name) { | |||||
| found = &opr->cast_final_safe<T>(); | |||||
| } | |||||
| }; | |||||
| cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); | |||||
| mgb_assert( | |||||
| found, "not found opr %s from %s", node_name.c_str(), | |||||
| endpoint.node()->name().c_str()); | |||||
| return *found; | |||||
| } | |||||
| template <typename T> | |||||
| size_t find_opr_num(SymbolVar endpoint) { | |||||
| size_t opr_num = 0; | |||||
| auto cb = [&opr_num](cg::OperatorNodeBase* opr) { | |||||
| if (opr->same_type<T>()) { | |||||
| opr_num++; | |||||
| } | |||||
| }; | |||||
| cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); | |||||
| return opr_num; | |||||
| } | |||||
| } // namespace | |||||
| TEST(TestGoptOldModel, FoldingGlobalPooling) { | |||||
| HostTensorGenerator<> gen; | |||||
| auto cn = CompNode::load("cpu0"); | |||||
| auto graph = ComputingGraph::make(); | |||||
| graph->options().graph_opt_level = 0; | |||||
| auto mkcvar = [&](const char* name, const TensorShape& shp) { | |||||
| return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name); | |||||
| }; | |||||
| auto host_x = gen({2, 3, 16, 16}, cn); | |||||
| auto x = opr::Host2DeviceCopy::make(*graph, host_x); | |||||
| opr::Convolution::Param param_conv; | |||||
| param_conv.stride_h = param_conv.stride_w = 1; | |||||
| param_conv.pad_h = param_conv.pad_w = 1; | |||||
| auto w1 = mkcvar("w1", {8, 3, 3, 3}); | |||||
| auto conv1 = | |||||
| opr::Convolution::make(x, w1, param_conv, {}, OperatorNodeConfig("conv1")); | |||||
| auto conv_n = opr::GetVarShape::make(conv1, 0); | |||||
| auto conv_c = opr::GetVarShape::make(conv1, 1); | |||||
| auto conv_h = opr::GetVarShape::make(conv1, 2); | |||||
| auto conv_w = opr::GetVarShape::make(conv1, 3); | |||||
| auto hxw = conv_h * conv_w; | |||||
| auto reshape_shape = opr::Concat::make({conv_n, conv_c, hxw}, 0); | |||||
| auto reshape1 = opr::Reshape::make(conv1, reshape_shape); | |||||
| opr::Reduce::Param param_reduce; | |||||
| param_reduce.axis = 2; | |||||
| param_reduce.mode = opr::Reduce::Mode::SUM; | |||||
| auto reduce = opr::Reduce::make(reshape1, param_reduce); | |||||
| auto reduce_remove_axis = opr::AxisAddRemove::make( | |||||
| reduce, {opr::AxisAddRemove::AxisDesc::make_remove(2)}); | |||||
| auto hw_count = opr::GetVarShape::make(reshape1, 2); | |||||
| auto fp32_hw_count = opr::TypeCvt::make(hw_count, dtype::Float32()); | |||||
| auto true_div = reduce_remove_axis / fp32_hw_count; | |||||
| auto y = opr::AxisAddRemove::make( | |||||
| true_div, {opr::AxisAddRemove::AxisDesc::make_add(2), | |||||
| opr::AxisAddRemove::AxisDesc::make_add(3)}); | |||||
| SymbolVar y_opt = y; | |||||
| { | |||||
| auto options = gopt::OptimizeForInferenceOptions{}; | |||||
| options.fuse_grain = true; | |||||
| unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); | |||||
| } | |||||
| ASSERT_EQ( | |||||
| opr::AdaptivePooling::Param::Mode::AVERAGE, | |||||
| find_opr<opr::AdaptivePooling>(y_opt).param().mode); | |||||
| graph->compile({{y_opt, {}}}) | |||||
| ->to_json() | |||||
| ->writeto_fpath(output_file("TestGoptOldModel.FoldingGlobalPooling.json")); | |||||
| HostTensorND host_y_opt, host_y; | |||||
| auto func = graph->compile( | |||||
| {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)}); | |||||
| func->execute(); | |||||
| MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); | |||||
| } | |||||
| TEST(TestGoptOldModel, FoldingGlobalPooling2) { | |||||
| HostTensorGenerator<> gen; | |||||
| auto cn = CompNode::load("cpu0"); | |||||
| auto graph = ComputingGraph::make(); | |||||
| graph->options().graph_opt_level = 0; | |||||
| auto mkcvar = [&](const char* name, const TensorShape& shp) { | |||||
| return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name); | |||||
| }; | |||||
| auto host_x = gen({2, 3, 16, 16}, cn); | |||||
| auto x = opr::Host2DeviceCopy::make(*graph, host_x); | |||||
| opr::Convolution::Param param_conv; | |||||
| param_conv.stride_h = param_conv.stride_w = 1; | |||||
| param_conv.pad_h = param_conv.pad_w = 1; | |||||
| auto w1 = mkcvar("w1", {8, 3, 3, 3}); | |||||
| auto conv1 = | |||||
| opr::Convolution::make(x, w1, param_conv, {}, OperatorNodeConfig("conv1")); | |||||
| auto conv_n = opr::GetVarShape::make(conv1, 0); | |||||
| auto conv_c = opr::GetVarShape::make(conv1, 1); | |||||
| auto conv_h = opr::GetVarShape::make(conv1, 2); | |||||
| auto conv_w = opr::GetVarShape::make(conv1, 3); | |||||
| auto hxw = conv_h * conv_w; | |||||
| auto reshape_shape = opr::Concat::make({conv_n, conv_c, hxw}, 0); | |||||
| auto reshape1 = opr::Reshape::make(conv1, reshape_shape); | |||||
| opr::Reduce::Param param_reduce; | |||||
| param_reduce.axis = 2; | |||||
| param_reduce.mode = opr::Reduce::Mode::SUM; | |||||
| auto reduce = opr::Reduce::make(reshape1, param_reduce); | |||||
| auto reduce_remove_axis = opr::AxisAddRemove::make( | |||||
| reduce, {opr::AxisAddRemove::AxisDesc::make_remove(2)}); | |||||
| auto hw_count = opr::GetVarShape::make(reshape1, 2); | |||||
| auto fp32_hw_count = opr::TypeCvt::make(hw_count, dtype::Float32()); | |||||
| auto true_div = reduce_remove_axis / fp32_hw_count; | |||||
| auto y = opr::Dimshuffle::make(true_div, {0, 1, -1, -1}); | |||||
| SymbolVar y_opt = y; | |||||
| { | |||||
| auto options = gopt::OptimizeForInferenceOptions{}; | |||||
| options.fuse_grain = true; | |||||
| unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); | |||||
| } | |||||
| ASSERT_EQ( | |||||
| opr::AdaptivePooling::Param::Mode::AVERAGE, | |||||
| find_opr<opr::AdaptivePooling>(y_opt).param().mode); | |||||
| graph->compile({{y_opt, {}}}) | |||||
| ->to_json() | |||||
| ->writeto_fpath(output_file("TestGoptOldModel.FoldingGlobalPooling2.json")); | |||||
| HostTensorND host_y_opt, host_y; | |||||
| auto func = graph->compile( | |||||
| {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)}); | |||||
| func->execute(); | |||||
| MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); | |||||
| } | |||||
| TEST(TestGoptOldModel, FoldingReduceMean) { | |||||
| HostTensorGenerator<> gen; | |||||
| auto cn = CompNode::load("cpu0"); | |||||
| auto graph = ComputingGraph::make(); | |||||
| graph->options().graph_opt_level = 0; | |||||
| auto mkcvar = [&](const char* name, const TensorShape& shp) { | |||||
| return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name); | |||||
| }; | |||||
| auto host_x = gen({2, 3, 16, 16}, cn); | |||||
| auto x = opr::Host2DeviceCopy::make(*graph, host_x); | |||||
| opr::Convolution::Param param_conv; | |||||
| param_conv.stride_h = param_conv.stride_w = 1; | |||||
| param_conv.pad_h = param_conv.pad_w = 1; | |||||
| auto w1 = mkcvar("w1", {8, 3, 3, 3}); | |||||
| auto conv1 = | |||||
| opr::Convolution::make(x, w1, param_conv, {}, OperatorNodeConfig("conv1")); | |||||
| auto conv_n = opr::GetVarShape::make(conv1, 0); | |||||
| auto conv_c = opr::GetVarShape::make(conv1, 1); | |||||
| auto conv_h = opr::GetVarShape::make(conv1, 2); | |||||
| auto conv_w = opr::GetVarShape::make(conv1, 3); | |||||
| auto hxw = conv_h * conv_w; | |||||
| auto reshape_shape = opr::Concat::make({conv_n, conv_c, hxw}, 0); | |||||
| auto reshape1 = opr::Reshape::make(conv1, reshape_shape); | |||||
| opr::Reduce::Param param_reduce; | |||||
| param_reduce.axis = 2; | |||||
| param_reduce.mode = opr::Reduce::Mode::SUM; | |||||
| auto reduce = opr::Reduce::make(reshape1, param_reduce); | |||||
| auto hw_count = opr::GetVarShape::make(reshape1, 2); | |||||
| auto y = reduce / hw_count; | |||||
| SymbolVar y_opt = y; | |||||
| { | |||||
| auto options = gopt::OptimizeForInferenceOptions{}; | |||||
| options.fuse_grain = true; | |||||
| unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); | |||||
| } | |||||
| ASSERT_EQ( | |||||
| opr::Reduce::Param::Mode::MEAN, find_opr<opr::Reduce>(y_opt).param().mode); | |||||
| graph->compile({{y_opt, {}}}) | |||||
| ->to_json() | |||||
| ->writeto_fpath(output_file("TestGoptOldModel.FoldingReduceMean.json")); | |||||
| HostTensorND host_y_opt, host_y; | |||||
| auto func = graph->compile( | |||||
| {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)}); | |||||
| func->execute(); | |||||
| MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); | |||||
| *host_x = *gen({2, 3, 16, 16}, cn); | |||||
| func->execute(); | |||||
| MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3); | |||||
| } | |||||
| @@ -7,6 +7,7 @@ | |||||
| #include "megbrain/opr/basic_arith_wrapper.h" | #include "megbrain/opr/basic_arith_wrapper.h" | ||||
| #include "megbrain/opr/blas.h" | #include "megbrain/opr/blas.h" | ||||
| #include "megbrain/opr/dnn/adaptive_pooling.h" | |||||
| #include "megbrain/opr/dnn/batch_norm.h" | #include "megbrain/opr/dnn/batch_norm.h" | ||||
| #include "megbrain/opr/dnn/convolution.h" | #include "megbrain/opr/dnn/convolution.h" | ||||
| #include "megbrain/opr/dnn/pooling.h" | #include "megbrain/opr/dnn/pooling.h" | ||||
| @@ -4113,6 +4114,71 @@ TEST(TestGoptInference, ConvertFormatNCHW44Reshape) { | |||||
| MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1); | MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1); | ||||
| } | } | ||||
| TEST(TestGoptInference, ConvertFormatNCHW44GlobalPooling) { | |||||
| HostTensorGenerator<> gen; | |||||
| auto cn = CompNode::load("cpu0"); | |||||
| auto graph = ComputingGraph::make(); | |||||
| graph->options().graph_opt_level = 0; | |||||
| auto mkcvar = [&](const char* name, const TensorShape& shp) { | |||||
| return opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name); | |||||
| }; | |||||
| auto host_x1 = gen({1, 4, 16, 16}, cn); | |||||
| auto x = opr::Host2DeviceCopy::make(*graph, host_x1); | |||||
| opr::Convolution::Param param_conv; | |||||
| param_conv.stride_h = param_conv.stride_w = 1; | |||||
| param_conv.pad_h = param_conv.pad_w = 1; | |||||
| auto w1 = mkcvar("w1", {8, 4, 3, 3}); | |||||
| auto conv1 = | |||||
| opr::Convolution::make(x, w1, param_conv, {}, OperatorNodeConfig("conv1")); | |||||
| auto conv_n = opr::GetVarShape::make(conv1, 0); | |||||
| auto conv_c = opr::GetVarShape::make(conv1, 1); | |||||
| auto conv_h = opr::GetVarShape::make(conv1, 2); | |||||
| auto conv_w = opr::GetVarShape::make(conv1, 3); | |||||
| auto hxw = conv_h * conv_w; | |||||
| auto reshape_shape = opr::Concat::make({conv_n, conv_c, hxw}, 0); | |||||
| auto reshape1 = opr::Reshape::make(conv1, reshape_shape); | |||||
| opr::Reduce::Param param_reduce; | |||||
| param_reduce.axis = 2; | |||||
| param_reduce.mode = opr::Reduce::Mode::SUM; | |||||
| auto reduce = opr::Reduce::make(reshape1, param_reduce); | |||||
| auto reduce_remove_axis = opr::AxisAddRemove::make( | |||||
| reduce, {opr::AxisAddRemove::AxisDesc::make_remove(2)}); | |||||
| auto hw_count = opr::GetVarShape::make(reshape1, 2); | |||||
| auto fp32_hw_count = opr::TypeCvt::make(hw_count, dtype::Float32()); | |||||
| auto reduce_mean = reduce_remove_axis / fp32_hw_count; | |||||
| auto global_pool = opr::AxisAddRemove::make( | |||||
| reduce_mean, {opr::AxisAddRemove::AxisDesc::make_add(2), | |||||
| opr::AxisAddRemove::AxisDesc::make_add(3)}); | |||||
| opr::Elemwise::Param elem_param; | |||||
| elem_param.mode = opr::Elemwise::Param::Mode::RELU; | |||||
| auto y = opr::Elemwise::make({global_pool}, elem_param); | |||||
| SymbolVar y_opt; | |||||
| auto options = gopt::OptimizeForInferenceOptions{}; | |||||
| options.enable_fuse_grain(); | |||||
| options.enable_nchw44(); | |||||
| unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); | |||||
| ASSERT_EQ( | |||||
| opr::AdaptivePooling::Param::Format::NCHW44, | |||||
| find_opr<opr::AdaptivePooling>(y_opt).param().format); | |||||
| graph->compile({{y_opt, {}}}) | |||||
| ->to_json() | |||||
| ->writeto_fpath(output_file( | |||||
| "TestGoptInference.ConvertFormatNCHW44GlobalPooling.json")); | |||||
| HostTensorND host_y_opt, host_y; | |||||
| auto func = graph->compile( | |||||
| {make_callback_copy(y, host_y), make_callback_copy(y_opt, host_y_opt)}); | |||||
| func->execute(); | |||||
| //! may fall back to winograd on x86-32, so relax the error bound to 1e-1 | |||||
| MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-1); | |||||
| } | |||||
| TEST(TestGoptInference, ConvertFormatNCHW44_DOT) { | TEST(TestGoptInference, ConvertFormatNCHW44_DOT) { | ||||
| HostTensorGenerator<> gen; | HostTensorGenerator<> gen; | ||||
| auto cn = CompNode::load("cpu0"); | auto cn = CompNode::load("cpu0"); | ||||
| @@ -39,7 +39,7 @@ void AdaptivePoolingForward::outshape_by_symvar_do_get_output_shape( | |||||
| cg::copy_tensor_value_to_shape(oshp2d, *shpinfo.shpval_inp_val.at(0)); | cg::copy_tensor_value_to_shape(oshp2d, *shpinfo.shpval_inp_val.at(0)); | ||||
| auto src = shpinfo.shape_inp_shp.at(0); | auto src = shpinfo.shape_inp_shp.at(0); | ||||
| mgb_assert( | mgb_assert( | ||||
| src.ndim == 4 && (oshp2d.ndim == 2 || oshp2d.ndim == 1), | |||||
| (src.ndim == 4 || src.ndim == 5) && (oshp2d.ndim == 2 || oshp2d.ndim == 1), | |||||
| "shape mismatch for AdaptivePooling: src=%s, out2d=%s", | "shape mismatch for AdaptivePooling: src=%s, out2d=%s", | ||||
| src.to_string().c_str(), oshp2d.to_string().c_str()); | src.to_string().c_str(), oshp2d.to_string().c_str()); | ||||
| @@ -57,8 +57,19 @@ void AdaptivePoolingForward::outshape_by_symvar_do_get_output_shape( | |||||
| dest.shape[1] = oshp2d.shape[0]; | dest.shape[1] = oshp2d.shape[0]; | ||||
| dest.shape[2] = (tshp1n) ? oshp2d.shape[0] : oshp2d.shape[1]; | dest.shape[2] = (tshp1n) ? oshp2d.shape[0] : oshp2d.shape[1]; | ||||
| dest.shape[3] = src.shape[3]; | dest.shape[3] = src.shape[3]; | ||||
| } else if ( | |||||
| param_format == Param::Format::NCHW44 || | |||||
| param_format == Param::Format::NCHW88) { | |||||
| dest.ndim = 5; | |||||
| dest.shape[0] = src.shape[0]; | |||||
| dest.shape[1] = src.shape[1]; | |||||
| dest.shape[2] = oshp2d.shape[0]; | |||||
| dest.shape[3] = (tshp1n) ? oshp2d.shape[0] : oshp2d.shape[1]; | |||||
| dest.shape[4] = src.shape[4]; | |||||
| } else { | } else { | ||||
| mgb_throw(MegBrainError, "AdaptivePooling only support NCHW or NHWC format"); | |||||
| mgb_throw( | |||||
| MegBrainError, "AdaptivePooling not support %d format", | |||||
| (int)param_format); | |||||
| } | } | ||||
| } | } | ||||
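To make the new branch concrete: with a NCHW44 src of shape {N, C/4, H, W, 4} and a target spatial shape {OH, OW} (or a single {O} applied to both H and W, the tshp1n case), the deduced dst keeps the batch, packed-channel, and pack dimensions and swaps in the target spatial extent. A standalone sketch under those assumptions; Shape5 and deduce_nchwxx_dst are illustrative names, not megbrain's shape API:

#include <array>
#include <cassert>
#include <cstddef>

// Illustrative 5-d shape; megbrain's TensorShape is richer than this.
using Shape5 = std::array<std::size_t, 5>;

// Mirrors the new NCHW44/NCHW88 branch: keep batch, packed channels,
// and the pack dimension; take the spatial extent from the requested
// output shape. tshp1n marks a 1-element target applied to both H and W.
static Shape5 deduce_nchwxx_dst(
        const Shape5& src, std::size_t oh, std::size_t ow, bool tshp1n) {
    return {src[0], src[1], oh, tshp1n ? oh : ow, src[4]};
}

int main() {
    Shape5 src = {2, 2, 16, 16, 4};          // NCHW44: N=2, C=8 packed as 2x4
    Shape5 dst = deduce_nchwxx_dst(src, 1, 1, false);
    assert((dst == Shape5{2, 2, 1, 1, 4}));  // global pooling keeps the pack dim
    return 0;
}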
| @@ -48,7 +48,6 @@ void run(Param::Mode mode) { | |||||
| Checker::RunOptions opt; | Checker::RunOptions opt; | ||||
| opt.numdiff_max_err = 1e-2; | opt.numdiff_max_err = 1e-2; | ||||
| Checker checker{make_graph, fwd}; | Checker checker{make_graph, fwd}; | ||||
| checker.set_input_allow_grad(1, false).set_input_generator(0, gen); | checker.set_input_allow_grad(1, false).set_input_generator(0, gen); | ||||
| checker.run({TensorShape{1, 1, 10, 7}, TensorShape{5, 4}}, opt); | checker.run({TensorShape{1, 1, 10, 7}, TensorShape{5, 4}}, opt); | ||||