GitOrigin-RevId: 94c719bb5c
@@ -3624,4 +3624,344 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
    MIDOUT_E
}
/* ==================== PaddingChannelPass ==================== */
const char* PaddingChannelPass::name() const {
    return mgb_cstr_log("padding output channel to multiple of 4/32");
}
void PaddingChannelPass::apply(OptState& opt) const {
    MIDOUT_B("PaddingChannelPass::apply");
    // padding changes tensor shapes, so skip the shape check when replacing vars
    opt.set_var_replace_check_flag(VarReplaceCheckFlag::CHECK_ALL ^
                                   VarReplaceCheckFlag::CHECK_SHAPE);
    ThinHashSet<OperatorNodeBase*> padding_oprs;
    ThinHashMap<Typeinfo*, thin_function<OperatorNodeBase*(
                                   OperatorNodeBase*, const VarNodeArray&)>>
            opr_replace_funcs;
    auto rewriter = opt.graph().make_rewriter();
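    // pad_in_channels: append `pad_channels` zero-filled channels along axis 1
    // (the channel axis of NCHW activations, and the input-channel axis of
    // NCHW weights) by concatenating a zero-valued ImmutableTensor.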
    auto pad_in_channels = [](VarNode* inp, size_t pad_channels) -> VarNode* {
        mgb_assert(inp->shape().ndim == 4);
        mgb_assert(inp->dtype().enumv() == DTypeEnum::QuantizedS8 ||
                   inp->dtype().enumv() == DTypeEnum::QuantizedS32);
        TensorShape shape{inp->shape()[0], pad_channels, inp->shape()[2],
                          inp->shape()[3]};
        std::shared_ptr<HostTensorND> host_val = std::make_shared<HostTensorND>(
                inp->comp_node(), shape, inp->dtype());
        auto ptr = host_val->raw_ptr();
        std::memset(ptr, 0, shape.total_nr_elems() * inp->dtype().size());
        auto padding =
                opr::ImmutableTensor::make(*inp->owner_graph(), *host_val);
        auto out = opr::Concat::make({inp, padding}, 1);
        return out.node();
    };
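    // pad_out_channels: likewise, but concatenate along axis 0, the
    // output-channel axis of NCHW weights.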
    auto pad_out_channels = [](VarNode* inp, size_t pad_channels) -> VarNode* {
        mgb_assert(inp->shape().ndim == 4);
        mgb_assert(inp->dtype().enumv() == DTypeEnum::QuantizedS8 ||
                   inp->dtype().enumv() == DTypeEnum::QuantizedS32);
        TensorShape shape{pad_channels, inp->shape()[1], inp->shape()[2],
                          inp->shape()[3]};
        std::shared_ptr<HostTensorND> host_val = std::make_shared<HostTensorND>(
                inp->comp_node(), shape, inp->dtype());
        auto ptr = host_val->raw_ptr();
        std::memset(ptr, 0, shape.total_nr_elems() * inp->dtype().size());
        auto padding =
                opr::ImmutableTensor::make(*inp->owner_graph(), *host_val);
        auto out = opr::Concat::make({inp, padding}, 0);
        return out.node();
    };
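    // extract_subtensor: slice the first `orig_channels` channels back out of
    // a padded tensor; used when a consumer (or a graph endpoint) needs the
    // original unpadded shape.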
    auto extract_subtensor = [](VarNode* inp,
                                size_t orig_channels) -> VarNode* {
        mgb_assert(inp->shape().ndim == 4);
        auto x = SymbolVar(inp);
        auto cv = [&x](int v) { return x.make_scalar(v); };
        using AIdx = opr::Subtensor::AxisIndexer;
        auto sub = opr::Subtensor::make(
                x, {AIdx::make_interval(0, None, None, cv(1)),
                    AIdx::make_interval(1, None, cv(orig_channels), None),
                    AIdx::make_interval(2, None, None, cv(1)),
                    AIdx::make_interval(3, None, None, cv(1))});
        return sub.node();
    };
    // padding policy for conv bias with data type qint8
    auto padding_policy_qint8 = [&padding_oprs, &pad_in_channels,
                                 &pad_out_channels](
                                        OperatorNodeBase* opr,
                                        const VarNodeArray& new_inp) {
        mgb_assert(opr->input().size() == new_inp.size());
        mgb_assert(new_inp.size() == 3);
        mgb_assert(opr->input(1)->shape().eq_shape(new_inp[1]->shape()));
        auto inps = new_inp;
        size_t out_channels = opr->input(1)->shape()[0];
        size_t in_channels = opr->input(1)->shape()[1];
        size_t new_in_channels = new_inp[0]->shape()[1];
        // pad input channels
        if (padding_oprs.count(opr->input(0)->owner_opr())) {
            size_t pad_channels = new_in_channels - in_channels;
            inps[1] = pad_in_channels(new_inp[1], pad_channels);
        } else {
            size_t pad_channels = 0;
            mgb_assert(new_in_channels == in_channels);
            if (in_channels <= 16) {
                if (in_channels % 4)
                    pad_channels = 4 - (in_channels % 4);  // pad to use dp4a
            } else {
                if (in_channels % 32)
                    pad_channels =
                            32 - (in_channels % 32);  // pad to use tensorcore
            }
            if (pad_channels > 0) {
                inps[0] = pad_in_channels(new_inp[0], pad_channels);
                inps[1] = pad_in_channels(new_inp[1], pad_channels);
            }
        }
        out_channels = inps[1]->shape()[0];
        in_channels = inps[1]->shape()[1];
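        // also pad the output channels so downstream convs see aligned input
        // channels: to a multiple of 4 for small channel counts (dp4a), and a
        // multiple of 32 otherwise (tensor core)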
        size_t pad_channels = 0;
        if (in_channels <= 16) {
            if (out_channels % 4)
                pad_channels = 4 - (out_channels % 4);
        } else {
            if (out_channels <= 16) {
                if (out_channels % 4)
                    pad_channels = 4 - (out_channels % 4);
            } else {
                if (out_channels % 32)
                    pad_channels = 32 - (out_channels % 32);
            }
        }
        if (pad_channels > 0) {
            inps[1] = pad_out_channels(inps[1], pad_channels);
            inps[2] = pad_in_channels(inps[2], pad_channels);
            padding_oprs.insert(opr);
        }
        return serialization::copy_opr_shallow(*opr, inps, opr->config());
    };
    // padding policy for conv bias with data type qint4 and quint4
    auto padding_policy_int4 = [&padding_oprs, &pad_in_channels,
                                &pad_out_channels](
                                       OperatorNodeBase* opr,
                                       const VarNodeArray& new_inp) {
        mgb_assert(opr->input().size() == new_inp.size());
        mgb_assert(new_inp.size() == 3);
        mgb_assert(opr->input(1)->shape().eq_shape(new_inp[1]->shape()));
        auto inps = new_inp;
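        // int4 channel padding is not implemented here yet: the opr is copied
        // through unchanged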
        return serialization::copy_opr_shallow(*opr, inps, opr->config());
    };
    opr_replace_funcs[opr::ConvBiasForward::typeinfo()] =
            [&padding_oprs, &padding_policy_qint8, &padding_policy_int4](
                    OperatorNodeBase* opr, const VarNodeArray& new_inp) {
                if (opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8) {
                    return padding_policy_qint8(opr, new_inp);
                } else if (opr->input(0)->dtype().enumv() ==
                                   DTypeEnum::QuantizedS4 ||
                           opr->input(0)->dtype().enumv() ==
                                   DTypeEnum::Quantized4Asymm) {
                    return padding_policy_int4(opr, new_inp);
                } else {
                    mgb_assert(
                            padding_oprs.count(opr->input(0)->owner_opr()) == 0,
                            "conv bias operator for data type(%s) cannot have "
                            "padded channels. "
                            "consumer(%s), producer(%s)",
                            opr->input(0)->dtype().name(), opr->cname(),
                            opr->input(0)->owner_opr()->cname());
                    return serialization::copy_opr_shallow(*opr, new_inp,
                                                           opr->config());
                }
            };
    opr_replace_funcs[opr::ConvolutionBackwardData::typeinfo()] =
            [&padding_oprs, &pad_in_channels, &pad_out_channels](
                    OperatorNodeBase* opr, const VarNodeArray& new_inp) {
                if (opr->input(1)->dtype().enumv() != DTypeEnum::QuantizedS8) {
                    mgb_assert(
                            padding_oprs.count(opr->input(0)->owner_opr()) == 0,
                            "conv bwd data operator for data type(%s) cannot "
                            "have padded channels. "
                            "consumer(%s), producer(%s)",
                            opr->input(0)->dtype().name(), opr->cname(),
                            opr->input(0)->owner_opr()->cname());
                    return serialization::copy_opr_shallow(*opr, new_inp,
                                                           opr->config());
                }
                mgb_assert(opr->input().size() == new_inp.size());
                mgb_assert(new_inp.size() == 2,
                           "deconv (conv bwd data) operator for inference can "
                           "only have 2 input vars(got:%zu)",
                           new_inp.size());
                mgb_assert(
                        opr->input(0)->shape().eq_shape(new_inp[0]->shape()));
                auto inps = new_inp;
                size_t out_channels = opr->input(0)->shape()[0];
                size_t in_channels = opr->input(0)->shape()[1];
                size_t new_out_channels = new_inp[1]->shape()[1];
                // pad output channels
                if (padding_oprs.count(opr->input(1)->owner_opr())) {
                    size_t pad_channels = new_out_channels - out_channels;
                    inps[0] = pad_out_channels(new_inp[0], pad_channels);
                } else {
                    size_t pad_channels = 0;
                    if (out_channels % 4)
                        pad_channels = 4 - (out_channels % 4);
                    if (pad_channels > 0) {
                        inps[0] = pad_out_channels(new_inp[0], pad_channels);
                        inps[1] = pad_in_channels(new_inp[1], pad_channels);
                    }
                }
                out_channels = inps[0]->shape()[0];
                in_channels = inps[0]->shape()[1];
                // pad input channels
                size_t pad_channels = 0;
                if (in_channels % 4)
                    pad_channels = 4 - (in_channels % 4);
                if (pad_channels > 0) {
                    inps[0] = pad_in_channels(inps[0], pad_channels);
                    padding_oprs.insert(opr);
                }
                return serialization::copy_opr_shallow(*opr, inps,
                                                       opr->config());
            };
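    // format-aware oprs (pooling, warp perspective) do not change the channel
    // count themselves, so they simply inherit the padding state of their
    // input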
    auto replace_format_aware_opr = [&padding_oprs](
                                            OperatorNodeBase* opr,
                                            const VarNodeArray& new_inp) {
        if (opr->input(0)->dtype().enumv() != DTypeEnum::QuantizedS8 &&
            opr->input(0)->dtype().enumv() != DTypeEnum::QuantizedS4 &&
            opr->input(0)->dtype().enumv() != DTypeEnum::Quantized4Asymm) {
            mgb_assert(padding_oprs.count(opr->input(0)->owner_opr()) == 0,
                       "operator(type:%s,name:%s) for data type(%s) cannot "
                       "have padded channels. extra info:"
                       "consumer(%s), producer(%s)",
                       opr->dyn_typeinfo()->name, opr->cname(),
                       opr->input(0)->dtype().name(), opr->cname(),
                       opr->input(0)->owner_opr()->cname());
            return serialization::copy_opr_shallow(*opr, new_inp,
                                                   opr->config());
        }
        mgb_assert(opr->input().size() == new_inp.size());
        if (padding_oprs.count(opr->input(0)->owner_opr())) {
            padding_oprs.insert(opr);
        }
        return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
    };
    opr_replace_funcs[opr::PoolingForward::typeinfo()] =
            replace_format_aware_opr;
    opr_replace_funcs[opr::WarpPerspectiveForward::typeinfo()] =
            replace_format_aware_opr;
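    // elemwise-like oprs can stay padded only when every input is padded to
    // the same channel count; on a mismatch, padded inputs are sliced back to
    // their original channels before the opr is rebuilt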
    auto replace_elemwise_like_opr = [&padding_oprs, &extract_subtensor](
                                             OperatorNodeBase* opr,
                                             const VarNodeArray& new_inp) {
        mgb_assert(opr->input().size() == new_inp.size());
        bool have_padding_inp = false;
        bool padding_all_inps = true;
        bool same_padding = true;
        size_t channels_after_padding = 0;
        for (auto&& cur_inp : opr->input()) {
            bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0;
            if (padding_cur_inp) {
                if (!have_padding_inp)
                    have_padding_inp = true;
                if (channels_after_padding == 0) {
                    channels_after_padding = cur_inp->shape()[1];
                } else {
                    same_padding =
                            channels_after_padding == cur_inp->shape()[1];
                }
            }
            if (padding_all_inps && (!padding_cur_inp || !same_padding))
                padding_all_inps = false;
        }
        if (have_padding_inp && !padding_all_inps) {
            auto inps = new_inp;
            for (size_t i = 0; i < new_inp.size(); ++i) {
                auto cur_inp = opr->input(i);
                bool padding_cur_inp =
                        padding_oprs.count(cur_inp->owner_opr()) > 0;
                if (padding_cur_inp) {
                    size_t orig_channels = cur_inp->shape()[1];
                    inps[i] = extract_subtensor(inps[i], orig_channels);
                }
            }
            return serialization::copy_opr_shallow(*opr, inps, opr->config());
        }
        if (padding_all_inps) {
            padding_oprs.insert(opr);
        }
        return serialization::copy_opr_shallow(*opr, new_inp, opr->config());
    };
    opr_replace_funcs[opr::ElemwiseMultiType::typeinfo()] =
            replace_elemwise_like_opr;
    opr_replace_funcs[opr::Elemwise::typeinfo()] = replace_elemwise_like_opr;
    opr_replace_funcs[opr::TypeCvt::typeinfo()] = replace_elemwise_like_opr;
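    // shape-sensitive oprs (Reshape, GetVarShape, Concat) must always see the
    // original unpadded shapes, so any padded input is sliced back first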
    auto replace_nonpadding_oprs = [&padding_oprs, &extract_subtensor](
                                           OperatorNodeBase* opr,
                                           const VarNodeArray& new_inp) {
        mgb_assert(opr->input().size() == new_inp.size());
        bool have_padding_inp = false;
        auto inps = new_inp;
        for (size_t i = 0; i < new_inp.size(); ++i) {
            auto cur_inp = opr->input(i);
            bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0;
            if (padding_cur_inp) {
                if (!have_padding_inp)
                    have_padding_inp = true;
                size_t orig_channels = cur_inp->shape()[1];
                inps[i] = extract_subtensor(inps[i], orig_channels);
            }
        }
        return serialization::copy_opr_shallow(*opr, inps, opr->config());
    };
    opr_replace_funcs[opr::Reshape::typeinfo()] = replace_nonpadding_oprs;
    opr_replace_funcs[opr::GetVarShape::typeinfo()] = replace_nonpadding_oprs;
    opr_replace_funcs[opr::Concat::typeinfo()] = replace_nonpadding_oprs;
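    // main dispatch: rewrite each opr via its registered replace function;
    // endpoint vars whose shape changed are sliced back to the original
    // channel count so the graph outputs are unaffected by padding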
    auto on_opr = [&opt, &rewriter, &opr_replace_funcs,
                   &extract_subtensor](OperatorNodeBase* opr) {
        auto it = opr_replace_funcs.find(opr->dyn_typeinfo());
        if (it != opr_replace_funcs.end()) {
            VarNodeArray new_inp;
            new_inp.reserve(opr->input().size());
            for (auto&& inp : opr->input()) {
                new_inp.push_back(rewriter.get_var(inp));
            }
            auto new_opr = (it->second)(opr, new_inp);
            auto &&out0 = opr->output(), &&out1 = new_opr->output();
            mgb_assert(out0.size() == out1.size(),
                       "bad opr replace: src=%s{%s} dst=%s{%s}, "
                       "src.size=%zu "
                       "dst.size=%zu",
                       opr->cname(), opr->dyn_typeinfo()->name,
                       new_opr->cname(), new_opr->dyn_typeinfo()->name,
                       out0.size(), out1.size());
            for (size_t i = 0; i < out0.size(); ++i) {
                if (!out0[i]->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) {
                    mgb_assert(!out1[i]->contain_flag(
                            VarNode::Flag::VOLATILE_CONTENT));
                    auto src = out0[i];
                    auto dst = out1[i];
                    if (opt.graph().endpoint_contain(src) &&
                        !src->shape().eq_shape(dst->shape())) {
                        size_t orig_channels = src->shape()[1];
                        dst = extract_subtensor(dst, orig_channels);
                    }
                    rewriter.replace_var(src, dst, nullptr);
                }
            }
        } else {
            rewriter.auto_replace_outputs(opr);
        }
    };
    opt.graph().iter(on_opr);
    rewriter.apply_inplace();
    MIDOUT_E
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
@@ -409,6 +409,16 @@ namespace gopt {
    void apply(OptState& opt) const override;
};
/*!
 * \brief pad channels to enable fast int8/int4 support
 *
 * The input network is assumed to be built in the NCHW tensor format.
 */
class PaddingChannelPass final : public Pass {
public:
    const char* name() const override;
    void apply(OptState& opt) const override;
};
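// Typical usage, as exercised by the tests added in this commit: register the
// pass on a GraphOptimizer and take the rewritten endpoint vars, e.g.
//
//     SymbolVar y_pad;
//     unpack_vector(gopt::GraphOptimizer{}
//                           .add_pass<gopt::PaddingChannelPass>()
//                           .apply({{y}})
//                           .endpoint_vars(),
//                   y_pad);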
}  // namespace gopt
}  // namespace mgb
@@ -4178,6 +4178,313 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
    MGB_ASSERT_TENSOR_EQ(host_y_fuse, host_y_non_fuse);
}
#endif
TEST(TestGoptInference, PaddingChannels) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap(got: "
               "%d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w2 = mkcvar("w2", {20, 24, 3, 3}, dtype::QuantizedS8(2.5f)),
         b2 = mkcvar("b2", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y2 = opr::ConvBias::make(y1, w2, b2, param, {},
                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
    auto y3 = opr::ElemwiseMultiType::make(
            {y, y2}, {ElemMultiMode::QFUSE_ADD_RELU},
            OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
    y3 = opr::TypeCvt::make(y3, dtype::Float32());
    SymbolVar y3_pad;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::PaddingChannelPass>()
                          .apply({{y3}})
                          .endpoint_vars(),
                  y3_pad);
    ASSERT_EQ(y3_pad.node()->shape()[1], y3.node()->shape()[1]);
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::ConvBias>()) {
            oprs.push_back(opr);
        }
    };
    cg::DepOprIter{cb}.add(y3_pad.node()->owner_opr());
    ASSERT_EQ(oprs.size(), 3);
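    // conv0 keeps 20 output channels (already a multiple of 4); conv1 and
    // conv2 see more than 16 input channels, so their outputs are padded to a
    // multiple of 32 for tensor cores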
    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 20);
    ASSERT_EQ(oprs[1]->output(0)->shape()[1], 32);
    ASSERT_EQ(oprs[2]->output(0)->shape()[1], 32);
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y3, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y3_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
TEST(TestGoptInference, ConcatAfterPaddingChannels) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
    cn.activate();
    auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    auto sm_ver = prop.major * 10 + prop.minor;
    if (sm_ver < 61) {
        printf("This testcase is ignored due to insufficient cuda cap(got: "
               "%d, expected: %d)\n",
               sm_ver, 61);
        return;
    }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {18, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 18, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w1 = mkcvar("w1", {18, 18, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 18, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    // concat at batch dim
    auto y2 = opr::Concat::make({y, y1}, 0);
    y2 = opr::TypeCvt::make(y2, dtype::Float32());
    SymbolVar y2_pad;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::PaddingChannelPass>()
                          .apply({{y2}})
                          .endpoint_vars(),
                  y2_pad);
    ASSERT_EQ(y2_pad.node()->shape()[1], y2.node()->shape()[1]);
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::ConvBias>()) {
            oprs.push_back(opr);
        }
    };
    cg::DepOprIter{cb}.add(y2_pad.node()->owner_opr());
    ASSERT_EQ(oprs.size(), 2);
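    // conv0: 18 output channels padded to 20 (multiple of 4); conv1 then sees
    // 20 (> 16) input channels, so its 18 outputs are padded to 32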
    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 20);
    ASSERT_EQ(oprs[1]->output(0)->shape()[1], 32);
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y2, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y2_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
// FIXME replace cpu with gpu to enable gpu validation
TEST(TestGoptInference, PaddingChannelsWithPooling) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("cpu0");
    // cn.activate();
    // auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    // auto sm_ver = prop.major * 10 + prop.minor;
    // if (sm_ver < 61) {
    //     printf("This testcase is ignored due to insufficient cuda cap(got: "
    //            "%d, expected: %d)\n",
    //            sm_ver, 61);
    //     return;
    // }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    y1 = opr::Pooling::make(y1, pool_param);
    y1 = opr::TypeCvt::make(y1, dtype::Float32());
    SymbolVar y1_pad;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::PaddingChannelPass>()
                          .apply({{y1}})
                          .endpoint_vars(),
                  y1_pad);
    ASSERT_EQ(y1_pad.node()->shape()[1], y1.node()->shape()[1]);
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::Pooling>()) {
            oprs.push_back(opr);
        }
    };
    cg::DepOprIter{cb}.add(y1_pad.node()->owner_opr());
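    // conv1's 24 output channels are padded to 32, and the pooling opr
    // propagates the padded layout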
    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 32);
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y1, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y1_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
// FIXME replace cpu with gpu to enable gpu validation
TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
    REQUIRE_GPU(1);
    auto cn = CompNode::load("cpu0");
    // cn.activate();
    // auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    // auto sm_ver = prop.major * 10 + prop.minor;
    // if (sm_ver < 61) {
    //     printf("This testcase is ignored due to insufficient cuda cap(got: "
    //            "%d, expected: %d)\n",
    //            sm_ver, 61);
    //     return;
    // }
    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    auto mkvar = [&](const char* name, const TensorShape& shp,
                     const DType& dtype) {
        return opr::TypeCvt::make(
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
                dtype);
    };
    auto mkcvar = [&](const char* name, const TensorShape& shp,
                      const DType& dtype) {
        return opr::TypeCvt::make(
                opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                        .rename(name),
                dtype);
    };
    std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
            cn, TensorShape{16, 3, 3}, dtype::Float32());
    warp_perspective_mat_gen(*mat, 16, 14, 14);
    auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");
    auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
         w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
         b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    opr::ConvBias::Param param;
    param.format = opr::ConvBias::Param::Format::NCHW;
    param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    auto y = opr::ConvBias::make(x, w, b, param, {},
                                 OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
         b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
                                  OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
    y1 = opr::WarpPerspective::make(y1, mat_var, TensorShape{14, 14},
                                    warp_param);
    y1 = opr::TypeCvt::make(y1, dtype::Float32());
    SymbolVar y1_pad;
    unpack_vector(gopt::GraphOptimizer{}
                          .add_pass<gopt::PaddingChannelPass>()
                          .apply({{y1}})
                          .endpoint_vars(),
                  y1_pad);
    ASSERT_EQ(y1_pad.node()->shape()[1], y1.node()->shape()[1]);
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto cb = [&oprs](cg::OperatorNodeBase* opr) {
        if (opr->same_type<opr::WarpPerspective>()) {
            oprs.push_back(opr);
        }
    };
    cg::DepOprIter{cb}.add(y1_pad.node()->owner_opr());
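    // conv1's 24 output channels are padded to 32, and warp perspective
    // propagates the padded layout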
    ASSERT_EQ(oprs[0]->output(0)->shape()[1], 32);
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y1, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y1_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}