GitOrigin-RevId: d1b95a6f01
tags/v1.5.0
@@ -1060,6 +1060,46 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) {
    CUBenchmarker<ConvBiasForward> benchmarker(handle_cuda());
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    using namespace conv_bias;
    UniformIntRNG int_rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW4_NCHW;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
    benchmarker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    benchmarker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::Float32())
            .set_dtype(3, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    auto run = [&](const TensorShapeArray& shapes) {
        auto time_in_ms =
                benchmarker.execs({shapes[0], shapes[1], shapes[2], {}, {}}) /
                RUNS;
        printf("src=%s, filter=%s, dst=%s, time=%.2f\n",
               shapes[0].to_string().c_str(), shapes[1].to_string().c_str(),
               shapes[2].to_string().c_str(), time_in_ms);
    };
    run({{16, 16, 224, 224, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    run({{16, 16, 92, 160, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    run({{16, 16, 46, 80, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
}

#if CUDA_VERSION >= 10020
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
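A note on the layouts exercised by the benchmark above: NCHW4_NCHW means src and filter are NCHW4 tensors (channels packed four at a time in the innermost dimension, hence shapes like {16, 16, 224, 224, 4}) while the output is plain float NCHW, which is why the bias shape is {1, 32, 1, 1}. A minimal sketch of the NCHW4 addressing, with a hypothetical helper name used purely for illustration and not taken from the patch:

#include <cstddef>

// Illustrative only: map a logical NCHW coordinate (n, c, h, w) into a dense
// NCHW4 buffer laid out as (N, C/4, H, W, 4); C is assumed divisible by 4.
static inline size_t nchw4_offset(size_t n, size_t c, size_t h, size_t w,
                                  size_t C, size_t H, size_t W) {
    size_t c_outer = c / 4, c_inner = c % 4;
    return (((n * (C / 4) + c_outer) * H + h) * W + w) * 4 + c_inner;
}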
@@ -772,7 +772,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
        add_pass<RemoveRedundantTypeCvtPass>();
        add_pass(FuseNCHW4Int8Preprocess::make());
        add_pass<FuseWarpPerspectiveDimshufflePass>();
#if CUDA_VERSION >= 10020
        add_pass<FoldingConvBiasDimshufflePass>();
#endif
    });
    cb(chwn4, {
        add_pass<FuseConvBiasNonlinPass>();
@@ -791,7 +793,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
        add_pass<RemoveRedundantTypeCvtPass>();
        add_pass(FuseNCHW4Int8Preprocess::make());
        add_pass<FuseWarpPerspectiveDimshufflePass>();
#if CUDA_VERSION >= 10020
        add_pass<FoldingConvBiasDimshufflePass>();
#endif
    });
    cb(fuse_conv_bias_nonlinearity, { add_pass<FuseConvBiasNonlinPass>(); });
@@ -3638,6 +3638,7 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const {
    MIDOUT_E
}

#if CUDA_VERSION >= 10020
/* ==================== FoldingConvBiasDimshufflePass ================= */
const char* FoldingConvBiasDimshufflePass::name() const {
    return mgb_cstr_log("folding conv bias dimshuffle pass");
@@ -4068,20 +4069,17 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
        return true;
    };
    MGB_MARK_USED_VAR(try_conv_reformat_nchw322nchw4);
    MGB_MARK_USED_VAR(try_conv_reformat_nchw42nchw32);
    auto on_opr = [&try_conv_dimshuffle_reshape_typecvt,
                   &try_conv_reformat_nchw42nchw32,
                   &try_conv_reformat_nchw42nhwc,
#if CUDA_VERSION >= 10020
                   &try_conv_reformat_nchw322nchw4,
#endif
                   &rewriter](OperatorNodeBase* opr) {
        if (!try_conv_dimshuffle_reshape_typecvt(opr) &&
            !try_conv_reformat_nchw42nchw32(opr) &&
            !try_conv_reformat_nchw42nhwc(opr)
#if CUDA_VERSION >= 10020
            && !try_conv_reformat_nchw322nchw4(opr)
#endif
        ) {
            rewriter.auto_replace_outputs(opr);
        }
@@ -4091,6 +4089,7 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
    MIDOUT_E
}
#endif
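The FoldingConvBiasDimshufflePass::apply body above follows the pattern used throughout these gopt passes: each candidate fusion is a callback that returns true if it rewrote the operator, and the visitor keeps the operator unchanged when none of them matches. A framework-free sketch of that dispatch shape (all names here are illustrative, not MegBrain API):

#include <functional>
#include <vector>

struct Opr;  // stand-in for OperatorNodeBase

// Illustrative only: try each rewrite in order; the first one that succeeds
// wins, otherwise the operator is kept as-is, mirroring the
// rewriter.auto_replace_outputs(opr) fallback above.
inline void visit_opr(Opr* opr,
                      const std::vector<std::function<bool(Opr*)>>& rewrites,
                      const std::function<void(Opr*)>& keep_unchanged) {
    for (auto&& try_rewrite : rewrites) {
        if (try_rewrite(opr))
            return;
    }
    keep_unchanged(opr);
}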
/* ==================== PaddingChannelPass ================= */
const char* PaddingChannelPass::name() const {
@@ -16,6 +16,10 @@
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h"

#if MGB_CUDA
#include <cuda.h>
#endif

namespace mgb {
namespace gopt {
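The new include matters because CUDA_VERSION is defined by <cuda.h>; without it the preprocessor treats the undefined macro as 0, so every "#if CUDA_VERSION >= 10020" guard below would quietly compile the guarded code out even on a new enough toolkit. A small sketch of that behaviour (illustrative, not part of the patch):

#if MGB_CUDA
#include <cuda.h>  // defines CUDA_VERSION, e.g. 10020 for CUDA 10.2
#endif

// Illustrative only: if the include is missing, CUDA_VERSION is undefined,
// undefined macros evaluate to 0 in #if expressions, and this branch is
// never taken regardless of the installed toolkit.
#if CUDA_VERSION >= 10020
// guarded declarations, e.g. FoldingConvBiasDimshufflePass, live here
#endif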
@@ -427,11 +431,13 @@ namespace gopt {
    void apply(OptState& opt) const override;
};

#if CUDA_VERSION >= 10020
class FoldingConvBiasDimshufflePass final : public Pass {
public:
    const char* name() const override;
    void apply(OptState& opt) const override;
};
#endif

/*!
 * \brief padding channel to enable fast int8/int4 support
@@ -4155,6 +4155,7 @@ TEST(TestGoptInference, WarpAndPreProcessCase1) {
    MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
}

#if CUDA_VERSION >= 10020
TEST(TestGoptInference, FoldingConvDimshuffle) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
@@ -4307,7 +4308,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#if CUDA_VERSION >= 10020
TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");