GitOrigin-RevId: d1b95a6f01
tags/v1.5.0
@@ -1060,6 +1060,46 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
                     param::ConvBias::Format::CHWN4);
 }
 
+TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) {
+    CUBenchmarker<ConvBiasForward> benchmarker(handle_cuda());
+    size_t RUNS = 1000;
+    benchmarker.set_display(false).set_times(RUNS);
+    using namespace conv_bias;
+    UniformIntRNG int_rng{-3, 3};
+    UniformIntRNG bias_rng{-50, 50};
+    ConvBias::Param param;
+    param.format = ConvBias::Param::Format::NCHW4_NCHW;
+    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
+    benchmarker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
+                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
+    benchmarker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
+            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
+            .set_dtype(2, dtype::Float32())
+            .set_dtype(3, dtype::Float32())
+            .set_dtype(4, dtype::Float32())
+            .set_rng(0, &int_rng)
+            .set_rng(1, &int_rng)
+            .set_param(param);
+    auto run = [&](const TensorShapeArray& shapes) {
+        auto time_in_ms =
+                benchmarker.execs({shapes[0], shapes[1], shapes[2], {}, {}}) /
+                RUNS;
+        printf("src=%s, filter=%s, bias=%s, time=%.2fms\n",
+               shapes[0].to_string().c_str(), shapes[1].to_string().c_str(),
+               shapes[2].to_string().c_str(), time_in_ms);
+    };
+    run({{16, 16, 224, 224, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
+    run({{16, 16, 92, 160, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
+    run({{16, 16, 46, 80, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
+}
+
 #if CUDA_VERSION >= 10020
 TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
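The 5-d shapes in the benchmark above follow the NCHW4 packing
(N, C/4, H, W, 4): {16, 16, 224, 224, 4} is logically a 16x64x224x224
int8 tensor, while the 4-d bias {1, 32, 1, 1} stays in plain NCHW because
the NCHW4_NCHW format produces a float NCHW output. A minimal sketch of
the packing, assuming the usual NCHW4 convention (the helper name is made
up for illustration):

    #include <cstddef>

    // Hypothetical helper: linear offset of logical element (n, c, h, w)
    // in an NCHW4-packed tensor of logical shape (N, C, H, W), C % 4 == 0.
    inline size_t nchw4_offset(size_t n, size_t c, size_t h, size_t w,
                               size_t C, size_t H, size_t W) {
        size_t c_outer = c / 4, c_inner = c % 4;  // channel group and lane
        return (((n * (C / 4) + c_outer) * H + h) * W + w) * 4 + c_inner;
    }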
@@ -772,7 +772,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
         add_pass<RemoveRedundantTypeCvtPass>();
         add_pass(FuseNCHW4Int8Preprocess::make());
         add_pass<FuseWarpPerspectiveDimshufflePass>();
+#if CUDA_VERSION >= 10020
         add_pass<FoldingConvBiasDimshufflePass>();
+#endif
     });
     cb(chwn4, {
         add_pass<FuseConvBiasNonlinPass>();
@@ -791,7 +793,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
         add_pass<RemoveRedundantTypeCvtPass>();
         add_pass(FuseNCHW4Int8Preprocess::make());
         add_pass<FuseWarpPerspectiveDimshufflePass>();
+#if CUDA_VERSION >= 10020
         add_pass<FoldingConvBiasDimshufflePass>();
+#endif
     });
     cb(fuse_conv_bias_nonlinearity, { add_pass<FuseConvBiasNonlinPass>(); });
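Both registration sites wrap FoldingConvBiasDimshufflePass in the same
compile-time check. CUDA_VERSION is defined by <cuda.h> as
major * 1000 + minor * 10, so 10020 means CUDA 10.2; the layout-folding
rewrites this pass enables presumably target kernels that require the
10.2 toolchain, so on older toolkits the pass is compiled out entirely
rather than registered and left unusable. The guard idiom in isolation:

    #include <cuda.h>  // defines CUDA_VERSION, e.g. 10020 for CUDA 10.2

    #if CUDA_VERSION >= 10020
    // declarations/passes that may assume a CUDA 10.2+ toolkit
    #endif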
@@ -3638,6 +3638,7 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const {
     MIDOUT_E
 }
 
+#if CUDA_VERSION >= 10020
 /* ==================== FoldingConvBiasDimshufflePass ================= */
 const char* FoldingConvBiasDimshufflePass::name() const {
     return mgb_cstr_log("folding conv bias dimshuffle pass");
@@ -4068,20 +4069,17 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
         return true;
     };
     MGB_MARK_USED_VAR(try_conv_reformat_nchw322nchw4);
+    MGB_MARK_USED_VAR(try_conv_reformat_nchw42nchw32);
     auto on_opr = [&try_conv_dimshuffle_reshape_typecvt,
                    &try_conv_reformat_nchw42nchw32,
                    &try_conv_reformat_nchw42nhwc,
-#if CUDA_VERSION >= 10020
                    &try_conv_reformat_nchw322nchw4,
-#endif
                    &rewriter](OperatorNodeBase* opr) {
         if (!try_conv_dimshuffle_reshape_typecvt(opr) &&
             !try_conv_reformat_nchw42nchw32(opr) &&
             !try_conv_reformat_nchw42nhwc(opr)
-#if CUDA_VERSION >= 10020
             && !try_conv_reformat_nchw322nchw4(opr)
-#endif
         ) {
             rewriter.auto_replace_outputs(opr);
         }
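The added MGB_MARK_USED_VAR keeps unused-variable warnings away in build
configurations where some of the rewrite lambdas are compiled out. Such
marker macros are conventionally a cast to void; a sketch of the idea
(the real megbrain definition may differ):

    // Evaluates the variable and discards the result, silencing
    // -Wunused-style warnings without generating any code.
    #define MARK_USED(v) static_cast<void>(v)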
@@ -4091,6 +4089,7 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
     MIDOUT_E
 }
+#endif
 
 /* ==================== PaddingChannelPass ================= */
 const char* PaddingChannelPass::name() const {
@@ -16,6 +16,10 @@
 #include "megbrain/opr/dnn/convolution.h"
 #include "megbrain/opr/search_policy/algo_chooser_helper.h"
 
+#if MGB_CUDA
+#include <cuda.h>
+#endif
+
 namespace mgb {
 namespace gopt {
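The <cuda.h> include here is load-bearing: the declarations below test
CUDA_VERSION with #if, and the C preprocessor silently evaluates an
undefined identifier as 0, so without this include the guarded class
would always be compiled out even on CUDA 10.2+ builds. A minimal
reproduction of the pitfall:

    // Without a prior #include <cuda.h>, CUDA_VERSION is undefined and
    // #if treats it as 0, so this branch can never be taken:
    #if CUDA_VERSION >= 10020  // 0 >= 10020 -> false
    #error "only reachable when <cuda.h> from CUDA 10.2+ was included"
    #endif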
@@ -427,11 +431,13 @@ namespace gopt {
         void apply(OptState& opt) const override;
     };
 
+#if CUDA_VERSION >= 10020
     class FoldingConvBiasDimshufflePass final : public Pass {
     public:
         const char* name() const override;
         void apply(OptState& opt) const override;
     };
+#endif
 
     /*!
      * \brief padding channel to enable fast int8/int4 support
@@ -4155,6 +4155,7 @@ TEST(TestGoptInference, WarpAndPreProcessCase1) {
     MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
 }
 
+#if CUDA_VERSION >= 10020
 TEST(TestGoptInference, FoldingConvDimshuffle) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");
@@ -4307,7 +4308,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
     MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
 }
 
-#if CUDA_VERSION >= 10020
 TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");