GitOrigin-RevId: 2ae8d1608d
tags/v1.5.0
@@ -65,8 +65,8 @@ void do_check_exec_common(
bias.to_string().c_str(), dst.to_string().c_str());
megdnn_assert(bias.shape[2] == 1);
megdnn_assert(bias.shape[3] == 1);
} else if (param().format == param::ConvBias::Format::NHWC ||
param().format == param::ConvBias::Format::NCHW4_NHWC) {
} else if (opr->param().format == param::ConvBias::Format::NHWC ||
opr->param().format == param::ConvBias::Format::NCHW4_NHWC) {
megdnn_assert(bias.shape[0] == 1);
megdnn_assert(bias.shape[1] == 1);
megdnn_assert(bias.shape[2] == 1);
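
Note: for the NHWC and NCHW4_NHWC output formats checked above, the bias must broadcast over N, H and W, i.e. be shaped (1, 1, 1, C). A minimal standalone restatement of that rule (the helper name and plain-array shape are illustrative, not part of the patch):

    #include <cstddef>

    // Mirrors the three megdnn_asserts above: only the channel axis of an
    // NHWC-style bias may be larger than 1.
    bool nhwc_bias_broadcasts_over_nhw(const std::size_t (&bias_shape)[4]) {
        return bias_shape[0] == 1 && bias_shape[1] == 1 && bias_shape[2] == 1;
    }
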
@@ -420,7 +420,8 @@ void TensorReformatPass::RelayoutPlaceholder::init_output_static_infer_desc() {
dst[4] = 32;
} else if (layout_type() ==
RelayoutPlaceholder::LayoutType::NCHW_TO_NCHW64) {
mgb_assert(inp_shape.ndim == 4 && inp_shape[1] % 64 == 0, "%s", inp_shape.to_string().c_str());
mgb_assert(inp_shape.ndim == 4 && inp_shape[1] % 64 == 0, "%s",
inp_shape.to_string().c_str());
dst.ndim = 5;
dst[0] = inp_shape[0];
dst[1] = inp_shape[1] / 64;
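
For reference, the NCHW_TO_NCHW64 branch above computes the blocked shape (N, C, H, W) -> (N, C/64, H, W, 64); the reworked mgb_assert only rewraps the line. A rough standalone sketch of the shape rule (function name and std::array usage are illustrative):

    #include <array>
    #include <cassert>
    #include <cstddef>

    // NCHW -> NCHW64: pack 64 channels into an innermost block axis.
    std::array<std::size_t, 5> nchw_to_nchw64(const std::array<std::size_t, 4>& s) {
        assert(s[1] % 64 == 0);  // mirrors the mgb_assert above
        return {s[0], s[1] / 64, s[2], s[3], 64};
    }
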
@@ -438,8 +439,6 @@ void TensorReformatPass::RelayoutPlaceholder::init_output_static_infer_desc() {
dst[4] = 32;
} else if (layout_type() ==
RelayoutPlaceholder::LayoutType::NCHW4_TO_NCHW64) {
mgb_assert(layout_type() ==
RelayoutPlaceholder::LayoutType::NCHW4_TO_NCHW64);
mgb_assert(inp_shape.ndim == 5 && inp_shape[1] % 16 == 0);
dst.ndim = 5;
dst[0] = inp_shape[0];
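
The NCHW4_TO_NCHW64 branch drops the duplicated layout_type() assert, which was redundant with the enclosing else-if condition; the remaining check requires inp_shape[1] % 16 == 0 because sixteen 4-channel groups form one 64-channel block. A rough sketch, assuming the same includes as the sketch above:

    // NCHW4 (N, C/4, H, W, 4) -> NCHW64 (N, C/64, H, W, 64); 16 * 4 = 64.
    std::array<std::size_t, 5> nchw4_to_nchw64(const std::array<std::size_t, 5>& s) {
        assert(s[4] == 4 && s[1] % 16 == 0);
        return {s[0], s[1] / 16, s[2], s[3], 64};
    }
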
@@ -499,18 +498,17 @@ void TensorReformatPass::RelayoutPlaceholder::init_output_static_infer_desc() {
} else if (layout_type() ==
RelayoutPlaceholder::LayoutType::NHWC_TO_NCHW32) {
mgb_assert(inp_shape.ndim == 4 && inp_shape[3] % 32 == 0);
dst.ndim = 4;
dst.ndim = 5;
dst[0] = inp_shape[0];
dst[1] = inp_shape[3] / 32;
dst[2] = inp_shape[1];
dst[3] = inp_shape[2];
dst[4] = 32;
} else if (layout_type() ==
RelayoutPlaceholder::LayoutType::NHWC_TO_NCHW64) {
} else {
mgb_assert(layout_type() ==
RelayoutPlaceholder::LayoutType::NHWC_TO_NCHW64);
mgb_assert(inp_shape.ndim == 4 && inp_shape[3] % 64 == 0);
dst.ndim = 4;
dst.ndim = 5;
dst[0] = inp_shape[0];
dst[1] = inp_shape[3] / 64;
dst[2] = inp_shape[1];
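
Both NHWC branches above produce a five-dimensional blocked shape, so dst.ndim is corrected from 4 to 5. A rough sketch of the NHWC -> NCHW32 rule under the same conventions; NHWC -> NCHW64 is analogous with a 64-wide block:

    // NHWC (N, H, W, C) -> NCHW32 (N, C/32, H, W, 32).
    std::array<std::size_t, 5> nhwc_to_nchw32(const std::array<std::size_t, 4>& s) {
        assert(s[3] % 32 == 0);
        return {s[0], s[3] / 32, s[1], s[2], 32};
    }
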
@@ -3729,21 +3727,6 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
return y1.node();
};
auto nhwc2nchw64 = [](VarNode* inp) -> VarNode* {
mgb_assert(inp->shape().ndim == 4);
auto x = SymbolVar(inp);
auto xshp = opr::GetVarShape::make(x);
auto cv = [&x](int v) { return x.make_scalar(v); };
auto sub = [&xshp, &cv](int idx) {
return opr::IndexAt::make(xshp, {{0, cv(idx)}});
};
auto tshp = opr::Concat::make(
{sub(0), sub(1), sub(2), sub(3) / 64, cv(64)}, 0);
auto y0 = opr::Reshape::make(x, tshp);
auto y1 = opr::Dimshuffle::make(y0, {0, 3, 1, 2, 4});
return y1.node();
};
auto try_conv_dimshuffle_reshape_typecvt = [&rewriter, &readers,
&nchw42nchw](
OperatorNodeBase* opr) {
@@ -3915,31 +3898,29 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
return true;
};
auto try_conv_reformat_nchw42nchw64 = [&rewriter, &nchw42nhwc, &nhwc2nchw64,
&readers](OperatorNodeBase* opr) {
auto try_conv_reformat_nchw42nhwc = [&rewriter, &nchw42nhwc,
&readers](OperatorNodeBase* opr) {
ThinHashSet<OperatorNodeBase*> opr_set;
ThinHashSet<OperatorNodeBase*> reader_set;
// check reshape
auto reshape1 =
try_cast_as_op<opr::Reshape>(opr);
if (reshape1 == nullptr)
auto reshape = try_cast_as_op<opr::Reshape>(opr);
if (reshape == nullptr)
return false;
opr_set.insert(opr);
// check dimshuffle
auto shuffle = try_cast_as_op<opr::Dimshuffle>(
reshape1->input(0)->owner_opr());
reshape->input(0)->owner_opr());
if (shuffle == nullptr)
return false;
auto&& param = shuffle->param();
if (param.pattern_len != 6)
if (param.pattern_len != 5)
return false;
bool is_nchw42nchw64 = param.pattern[0] == 0 && param.pattern[1] == 1 &&
param.pattern[2] == 3 && param.pattern[3] == 4 &&
param.pattern[4] == 2 && param.pattern[5] == 5 &&
shuffle->output(0)->shape()[5] == 4 &&
shuffle->output(0)->shape()[4] == 16;
if (!is_nchw42nchw64)
bool is_nchw42nhwc = param.pattern[0] == 0 && param.pattern[1] == 2 &&
param.pattern[2] == 3 && param.pattern[3] == 1 &&
param.pattern[4] == 4 &&
shuffle->output(0)->shape()[4] == 4;
if (!is_nchw42nhwc)
return false;
opr_set.insert(shuffle);
for (auto&& i : readers[shuffle]) {
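
The matcher is renamed from try_conv_reformat_nchw42nchw64 to try_conv_reformat_nchw42nhwc: instead of the six-axis pattern it now accepts a five-axis Dimshuffle {0, 2, 3, 1, 4} whose last output axis is 4. On an NCHW4 tensor (N, C/4, H, W, 4) that permutation gives (N, H, W, C/4, 4), and the Reshape checked above then merges the last two axes into NHWC. A minimal sketch of the shape arithmetic, same conventions as the earlier sketches:

    // NCHW4 (N, C/4, H, W, 4) --Dimshuffle{0,2,3,1,4}--> (N, H, W, C/4, 4)
    //                         --Reshape (merge last two axes)--> (N, H, W, C)
    std::array<std::size_t, 4> nchw4_to_nhwc(const std::array<std::size_t, 5>& s) {
        assert(s[4] == 4);
        return {s[0], s[2], s[3], s[1] * 4};
    }
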
@@ -3948,20 +3929,8 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
}
}
// check reshape
auto reshape2 =
try_cast_as_op<opr::Reshape>(shuffle->input(0)->owner_opr());
if (reshape2 == nullptr)
return false;
opr_set.insert(reshape2);
for (auto&& i : readers[reshape2]) {
if (i.second & DepType::DEV_VALUE) {
reader_set.insert(i.first);
}
}
auto typecvt =
try_cast_as_op<opr::TypeCvt>(reshape2->input(0)->owner_opr());
try_cast_as_op<opr::TypeCvt>(shuffle->input(0)->owner_opr());
if (typecvt == nullptr)
return false;
auto in_dtype = typecvt->input(0)->dtype(),
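
With the reshape2 stage removed, the rewritten matcher walks a shorter producer chain: Reshape <- Dimshuffle <- TypeCvt <- ConvBias. A condensed, hypothetical restatement of that walk (error handling and reader bookkeeping elided; not code from the patch):

    // Bottom-up match of the NCHW4 -> NHWC reformat tail.
    auto reshape = try_cast_as_op<opr::Reshape>(opr);
    auto shuffle = try_cast_as_op<opr::Dimshuffle>(reshape->input(0)->owner_opr());
    auto typecvt = try_cast_as_op<opr::TypeCvt>(shuffle->input(0)->owner_opr());
    auto conv_bias = try_cast_as_op<opr::ConvBias>(typecvt->input(0)->owner_opr());
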
@@ -3972,6 +3941,11 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
if (!is_s82s4)
return false;
opr_set.insert(typecvt);
for (auto&& i : readers[typecvt]) {
if (i.second & DepType::DEV_VALUE) {
reader_set.insert(i.first);
}
}
// check conv bias
auto conv_bias =
@@ -4006,11 +3980,10 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
auto conv_bias_shuffle = opr::ConvBias::make(
src, filter, new_bias, new_param, conv_bias->execution_policy(),
OperatorNodeConfig{out_dtype});
auto new_var = nhwc2nchw64(conv_bias_shuffle.node());
rewriter.replace_var(
opr->output(0), new_var,
opr->output(0), conv_bias_shuffle.node(),
mgb_cstr_log("replace conv_bias + "
"reformat to conv_bias(NCHW4_NCHW64)"));
"reformat to conv_bias(NCHW4_NHWC)"));
return true;
};
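
Since the fused convolution now produces NHWC output directly, the nhwc2nchw64 wrapper is gone and conv_bias_shuffle.node() replaces the matched output as-is. The hunk does not show where new_param is prepared; presumably its format is switched to the new enum, along the lines of:

    // Assumption, not shown in this hunk: reuse the matched conv's param and
    // retarget it to the fused NCHW4-input / NHWC-output format.
    auto new_param = conv_bias->param();
    new_param.format = opr::ConvBias::Param::Format::NCHW4_NHWC;
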
@@ -4098,14 +4071,14 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
auto on_opr = [&try_conv_dimshuffle_reshape_typecvt,
&try_conv_reformat_nchw42nchw32,
&try_conv_reformat_nchw42nchw64,
&try_conv_reformat_nchw42nhwc,
#if CUDA_VERSION >= 10020
&try_conv_reformat_nchw322nchw4,
#endif
&rewriter](OperatorNodeBase* opr) {
if (!try_conv_dimshuffle_reshape_typecvt(opr) &&
!try_conv_reformat_nchw42nchw32(opr) &&
!try_conv_reformat_nchw42nchw64(opr)
!try_conv_reformat_nchw42nchw32(opr) &&
!try_conv_reformat_nchw42nhwc(opr)
#if CUDA_VERSION >= 10020
&& !try_conv_reformat_nchw322nchw4(opr)
#endif
@@ -4546,6 +4519,9 @@ VarNode* EnableNCHW64Pass::on_graph_endpoint_var(VarNode* new_var,
case Format::NCHW64:
type = LayoutType::NCHW64_TO_NCHW;
break;
case Format::NHWC:
type = LayoutType::NHWC_TO_NCHW;
break;
default:
mgb_throw(AssertionError,
"format(%d) is not supported, related var "
@@ -4980,7 +4956,7 @@ EnableNCHW64Pass::make_nchw64_converter() {
case Format::NHWC:
inps[1] = RelayoutPlaceholder::make(
inps[1], RelayoutPlaceholder::LayoutType::
NCHW_TO_NHWC)
NHWC_TO_NCHW4)
.node();
break;
case Format::NCHW32:
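
This hunk swaps the LayoutType used for inps[1] in the Format::NHWC case between NCHW_TO_NHWC and NHWC_TO_NCHW4. For reference, the NHWC -> NCHW4 shape rule, as a rough sketch under the same conventions as the earlier sketches:

    // NHWC (N, H, W, C) -> NCHW4 (N, C/4, H, W, 4).
    std::array<std::size_t, 5> nhwc_to_nchw4(const std::array<std::size_t, 4>& s) {
        assert(s[3] % 4 == 0);
        return {s[0], s[3] / 4, s[1], s[2], 4};
    }
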
@@ -4404,10 +4404,10 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NHWC) {
};
auto x = mkvar("x", {32, 4, 23, 40}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w", {64, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f)),
w1 = mkcvar("w1", {64, 64, 3, 3}, dtype::QuantizedS4(1.234f)),
b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32(12.34567f*1.234f));
w = mkcvar("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)),
w1 = mkcvar("w1", {32, 32, 3, 3}, dtype::QuantizedS4(1.234f)),
b1 = mkcvar("b1", {1, 32, 1, 1}, dtype::QuantizedS32(12.34567f*1.234f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
@@ -4438,7 +4438,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NHWC) {
->writeto_fpath(output_file(
"TestGoptInference.FoldingConvDimshuffleNCHW4NHWC.json"));
size_t nr_dimshuffle = find_opr_num<opr::TypeCvt>(y_fuse);
ASSERT_EQ(3u, find_opr_num<opr::Dimshuffle>(y_fuse));
ASSERT_EQ(2u, nr_dimshuffle);
bool found = false;
cg::DepOprIter{[&found](cg::OperatorNodeBase* opr) {
if (!found && opr->same_type<opr::ConvBias>()) {
@@ -4735,101 +4735,6 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
MGB_ASSERT_TENSOR_EQ(t1, t2);
}
TEST(TestGoptInference, PaddingChannelsB4) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY(7, 5);
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};
auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w", {16, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;
auto y = opr::ConvBias::make(x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y = opr::TypeCvt::make(y, dtype::Quantized4Asymm{20.f, 8});
opr::Pooling::Param pool;
pool.format = opr::Pooling::Param::Format::NCHW;
y = opr::Pooling::make(y, pool);
auto w1 = mkcvar("w1", {48, 16, 3, 3}, dtype::QuantizedS4(1.234f)),
b1 = mkcvar("b1", {1, 48, 1, 1}, dtype::QuantizedS32(20.f*1.234f));
auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
OperatorNodeConfig{dtype::Quantized4Asymm(20.f, 8)});
auto w2 = mkcvar("w2", {48, 48, 3, 3}, dtype::QuantizedS4(1.234f)),
b2 = mkcvar("b2", {1, 48, 1, 1}, dtype::QuantizedS32(20.f*1.234f));
auto y2 = opr::ConvBias::make(
y1, w2, b2, param, {},
OperatorNodeConfig{dtype::Quantized4Asymm(20.f, 8)});
auto w3 = mkcvar("w2", {16, 48, 3, 3}, dtype::QuantizedS4(1.234f)),
b3 = mkcvar("b2", {1, 16, 1, 1}, dtype::QuantizedS32(20.f*1.234f));
auto y3 = opr::ConvBias::make(
y2, w3, b3, param, {},
OperatorNodeConfig{dtype::Quantized4Asymm(20.f, 8)});
using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
auto y4 = opr::ElemwiseMultiType::make(
{y, y3}, {ElemMultiMode::QFUSE_ADD_RELU},
OperatorNodeConfig{dtype::Quantized4Asymm{20.f, 7}});
y4 = opr::TypeCvt::make(y4, dtype::Float32());
SymbolVar y4_pad;
unpack_vector(gopt::GraphOptimizer{}
.add_pass<gopt::PaddingChannelPass>()
.add_pass<gopt::ParamFusePass>()
.apply({{y4}})
.endpoint_vars(),
y4_pad);
ASSERT_EQ(y4_pad.node()->shape()[1], y4.node()->shape()[1]);
SmallVector<cg::OperatorNodeBase*> oprs;
auto cb1 = [&oprs](cg::OperatorNodeBase* opr) {
if (opr->same_type<opr::ConvBias>()) {
oprs.push_back(opr);
}
};
cg::DepOprIter{cb1}.add(y4_pad.node()->owner_opr());
ASSERT_EQ(oprs.size(), 4);
ASSERT_EQ(oprs[0]->output(0)->shape()[1], 16);
ASSERT_EQ(oprs[1]->output(0)->shape()[1], 64);
ASSERT_EQ(oprs[2]->output(0)->shape()[1], 64);
ASSERT_EQ(oprs[3]->output(0)->shape()[1], 16);
size_t nr_concat = find_opr_num<opr::Concat>(y4_pad);
ASSERT_EQ(nr_concat, 1);
cg::OperatorNodeBase* concat = nullptr;
auto cb2 = [&concat](cg::OperatorNodeBase* opr) {
if (opr->same_type<opr::Concat>()) {
concat = opr;
}
};
cg::DepOprIter{cb2}.add(y4_pad.node()->owner_opr());
ASSERT_EQ(oprs[0]->input(0)->owner_opr(), concat);
HostTensorND t1, t2;
auto func1 = graph->compile({make_callback_copy(y4, t1)});
func1->execute();
auto func2 = graph->compile({make_callback_copy(y4_pad, t2)});
func2->execute();
MGB_ASSERT_TENSOR_EQ(t1, t2);
}
#endif