GitOrigin-RevId: c0106ade08
tags/v1.11.1
@@ -91,6 +91,7 @@ __all__ = [
    "warp_affine",
    "warp_perspective",
    "pixel_shuffle",
    "region_restricted_conv",
]
@@ -1213,10 +1214,10 @@ def layer_norm(
):
    r"""Applies layer normalization to the input. Supports tensors of any shape as input.
    Reference: https://arxiv.org/pdf/1803.08494.pdf.
    Args:
        inp: input tensor.
        normalized_shape: the shape that you want to be normalized.
        affine: whether to use weight and bias.
        weight: must not be None when affine is true.
        bias: must not be None when affine is true.
@@ -1974,6 +1975,61 @@ def pixel_shuffle(inp: Tensor, upscale_factor: int) -> Tensor:
    return pixel_shuffle_cpp(inp, upscale_factor, _layerPixelShuffle_traceable)
def region_restricted_conv(
    inp: Tensor,
    weight: Tensor,
    rin: Tensor,
    rout: Tensor,
    stride: Union[int, Tuple[int, int]] = 1,
    padding: Union[int, Tuple[int, int]] = 0,
    dilation: Union[int, Tuple[int, int]] = 1,
    groups: int = 1,
    conv_mode: str = "cross_correlation",
    compute_mode: str = "default",
) -> Tensor:
| r"""Region Restricted convolution operation. | |||
| Refer to :class:`~.RegionRestrictedConv` for more information. | |||
| Args: | |||
| inp: feature map of the convolution operation. | |||
| weight: convolution kernel. | |||
| stride: stride of the 2D region restricted convolution operation. Default: 1 | |||
| padding: size of the paddings added to the input on both sides of its | |||
| spatial dimensions. Only zero-padding is supported. Default: 0 | |||
| dilation: dilation of the 2D convolution operation. Default: 1 | |||
| groups: number of groups into which the input and output channels are divided, | |||
| so as to perform a ``grouped convolution``. When ``groups`` is not 1, | |||
| ``in_channels`` and ``out_channels`` must be divisible by ``groups``, | |||
| and the shape of weight should be ``(groups, out_channel // groups, | |||
| in_channels // groups, depth, height, width)``. Default: 1 | |||
| conv_mode: supports "cross_correlation". Default: "cross_correlation" | |||
| Returns: | |||
| output tensor. | |||
| """ | |||
    assert conv_mode.lower() == "cross_correlation"
    pad_h, pad_w = _expand_hw(padding)
    stride_h, stride_w = _expand_hw(stride)
    dilate_h, dilate_w = _expand_hw(dilation)
    sparse_type = "dense" if groups == 1 else "group"
    op = builtin.RegionRestrictedConvolution(
        stride_h=stride_h,
        stride_w=stride_w,
        pad_h=pad_h,
        pad_w=pad_w,
        dilate_h=dilate_h,
        dilate_w=dilate_w,
        mode=conv_mode,
        compute_mode=compute_mode,
        sparse=sparse_type,
    )
    (output,) = apply(op, inp, weight, rin, rout)
    return output
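
# A minimal usage sketch of the functional API above, following the shapes and
# dtypes used by the unit tests in this change (hedged reading: ``rin``/``rout``
# assign a region ID to every input/output pixel, shapes ``(N, IH, IW)`` and
# ``(N, OH, OW)``, and only positions with matching region IDs contribute):
#
#     import numpy as np
#     import megengine as mge
#     import megengine.functional as F
#
#     inp = mge.tensor(np.arange(8, dtype=np.float32).reshape(1, 2, 2, 2))
#     weight = mge.tensor(np.ones((2, 1, 1, 2, 2), np.float32))  # (groups, OCPG, ICPG, FH, FW)
#     rin = mge.tensor(np.ones((1, 2, 2), dtype=np.int32))
#     rout = mge.tensor(np.ones((1, 1, 1), dtype=np.int32))
#     out = F.region_restricted_conv(inp, weight, rin, rout, groups=2)
#     print(out.shape)  # (1, 2, 1, 1)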
from .quantized import conv_bias_activation  # isort:skip
from .loss import *  # isort:skip
from .vision import *  # isort:skip
@@ -14,6 +14,7 @@ from .conv import (
    ConvTranspose3d,
    DeformableConv2d,
    LocalConv2d,
    RegionRestrictedConv,
)
from .conv_bn import ConvBn2d, ConvBnRelu2d
from .deformable_psroi_pooling import DeformablePSROIPooling
@@ -12,6 +12,7 @@ from ..functional import (
    deformable_conv2d,
    local_conv2d,
    pad,
    region_restricted_conv,
    relu,
)
from ..tensor import Parameter
@@ -982,3 +983,174 @@ class ConvTranspose3d(_ConvNd):
            self.output_padding,
            self.dilation,
        )
class RegionRestrictedConv(_ConvNd):
    r"""Applies a 2D region restricted convolution over an input tensor.
    For instance, given an input of the size :math:`(N, C_{\text{in}}, H, W)`,
    this layer generates an output of the size
    :math:`(N, C_{\text{out}}, H_{\text{out}}, W_{\text{out}})` through the
    process described as below:

    .. math::
        \text{out}(N_i, C_{\text{out}_j}) =
        \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k)

    where :math:`\star` is the valid 2D cross-correlation operator,
    :math:`N` is batch size, :math:`C` denotes number of channels,
    :math:`H` is height of input planes in pixels, and :math:`W` is
    width in pixels.

    In general, output feature maps' shapes can be inferred as follows:

    input: :math:`(N, C_{\text{in}}, H_{\text{in}}, W_{\text{in}})`

    output: :math:`(N, C_{\text{out}}, H_{\text{out}}, W_{\text{out}})` where

    .. math::
        \text{H}_{out} = \lfloor \frac{\text{H}_{in} + 2 * \text{padding[0]} -
        \text{dilation[0]} * (\text{kernel_size[0]} - 1) - 1}{\text{stride[0]}} + 1 \rfloor

    .. math::
        \text{W}_{out} = \lfloor \frac{\text{W}_{in} + 2 * \text{padding[1]} -
        \text{dilation[1]} * (\text{kernel_size[1]} - 1) - 1}{\text{stride[1]}} + 1 \rfloor

    When ``groups == in_channels`` and ``out_channels == K * in_channels``,
    where K is a positive integer, this operation is also known as depthwise
    convolution.

    In other words, for an input of size :math:`(N, C_{in}, H_{in}, W_{in})`,
    a depthwise convolution with a depthwise multiplier `K` can be constructed
    by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        kernel_size: size of weight on spatial dimensions. If kernel_size is
            an :class:`int`, the actual kernel size would be
            ``(kernel_size, kernel_size)``.
        stride: stride of the 2D convolution operation. Default: 1
        padding: size of the paddings added to the input on both sides of its
            spatial dimensions. Default: 0
        dilation: dilation of the 2D convolution operation. Default: 1
        groups: number of groups into which the input and output channels are divided,
            so as to perform a ``grouped convolution``. When ``groups`` is not 1,
            ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
            and the shape of weight should be ``(groups, out_channels // groups,
            in_channels // groups, height, width)``. Default: 1
        conv_mode: supports ``cross_correlation``. Default: ``cross_correlation``
        compute_mode: when set to "default", no special requirements will be
            placed on the precision of intermediate results. When set to "float32",
            "float32" would be used for the accumulator and intermediate result, but is only
            effective when input and output are of float16 dtype.
        padding_mode: "zeros", "reflect" or "replicate". Default: "zeros".
            Refer to :class:`~.module.padding.Pad` for more information.

    Note:
        * ``weight`` usually has shape ``(out_channels, in_channels, height, width)``;
          if ``groups`` is not 1, the shape will be
          ``(groups, out_channels // groups, in_channels // groups, height, width)``.

    Examples:
        >>> import numpy as np
        >>> import megengine as mge
        >>> import megengine.module as M
        >>> rrconv = M.RegionRestrictedConv(in_channels=2, out_channels=2, kernel_size=2, groups=2)
        >>> inp = mge.tensor(np.random.randn(1, 2, 2, 2).astype(np.float32))
        >>> rin = mge.tensor(np.random.randn(1, 2, 2).astype(np.int32))
        >>> rout = mge.tensor(np.random.randn(1, 1, 1).astype(np.int32))
        >>> oup = rrconv(inp, rin, rout)
        >>> oup.numpy().shape
        (1, 2, 1, 1)
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        groups: int,
        stride: Union[int, Tuple[int, int]] = 1,
        padding: Union[int, Tuple[int, int]] = 0,
        dilation: Union[int, Tuple[int, int]] = 1,
        conv_mode: str = "cross_correlation",
        compute_mode: str = "default",
        padding_mode: str = "zeros",
        **kwargs
    ):
        kernel_size = _pair_nonzero(kernel_size)
        stride = _pair_nonzero(stride)
        padding = _pair(padding)
        dilation = _pair_nonzero(dilation)
        self.conv_mode = conv_mode
        self.compute_mode = compute_mode
        self.padding_mode = padding_mode
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
            0,
            dilation,
            groups,
            False,
            **kwargs,
        )

    def _get_fanin(self):
        kh, kw = self.kernel_size
        ic = self.in_channels
        return kh * kw * ic

    def _infer_weight_shape(self):
        group = self.groups
        ichl = self.in_channels
        ochl = self.out_channels
        kh, kw = self.kernel_size
        if group == 1:
            # Assume format is NCHW
            return (ochl, ichl, kh, kw)
        assert (
            ichl % group == 0 and ochl % group == 0
        ), "invalid config: in_channels={} out_channels={} group={}".format(
            ichl, ochl, group
        )
        # Assume format is NCHW
        return (group, ochl // group, ichl // group, kh, kw)

    def _infer_bias_shape(self):
        # Assume format is NCHW
        return (1, self.out_channels, 1, 1)

    def get_pad_width(self):
        return (
            (0, 0),
            (0, 0),
            (self.padding[0], self.padding[0]),
            (self.padding[1], self.padding[1]),
        )

    def calc_conv(self, inp, weight, rin, rout):
        assert self.padding_mode in [
            "zeros",
            "reflect",
            "replicate",
        ]
        return region_restricted_conv(
            inp,
            weight,
            rin,
            rout,
            self.stride,
            self.padding,
            self.dilation,
            self.groups,
            self.conv_mode,
            self.compute_mode,
        )

    def forward(self, inp, rin, rout):
        return self.calc_conv(inp, self.weight, rin, rout)
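
# A quick sanity check of the H_out/W_out formulas in the docstring above,
# using the values from its doctest (2x2 input, 2x2 kernel, stride 1,
# padding 0, dilation 1); a sketch, not part of the module itself:
#
#     import math
#
#     def out_size(i, pad, dil, k, s):
#         return math.floor((i + 2 * pad - dil * (k - 1) - 1) / s + 1)
#
#     assert out_size(2, 0, 1, 2, 1) == 1  # hence the (1, 2, 1, 1) output shape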
@@ -930,6 +930,179 @@ def test_batch_conv_bias():
    run(1, 4, 4, 5, 5, 3, 3, 0, 0, 1, 1, True)


def test_region_restricted_conv_forward_backward_naive():
    import megengine as mge
    import megengine.module as M
    from megengine.autodiff import GradManager

    handle = "cpu0"
    src_1 = np.arange(8).reshape(1, 2, 2, 2).astype(np.float32)
    filter_1 = np.arange(8).reshape(2, 1, 1, 2, 2).astype(np.float32)
    rin_1 = np.array([1, 1, 1, 1]).reshape(1, 2, 2).astype(np.int32)
    rout_1 = np.array([1]).reshape(1, 1, 1).astype(np.int32)
    cpu_src = tensor(src_1, device=handle)
    cpu_filter = tensor(filter_1, device=handle)
    gm = GradManager().attach([cpu_src, cpu_filter])
    with gm:
        cpu_out = F.region_restricted_conv(
            cpu_src,
            cpu_filter,
            tensor(rin_1, device=handle),
            tensor(rout_1, device=handle),
            groups=2,
        )
        gm.backward(cpu_out, tensor(np.ones((1, 2, 1, 1)), device=handle))
        np.testing.assert_allclose(
            cpu_src.grad, np.array([0, 1, 2, 3, 4, 5, 6, 7]).reshape(1, 2, 2, 2)
        )
        np.testing.assert_allclose(
            cpu_filter.grad, np.array([0, 1, 2, 3, 4, 5, 6, 7]).reshape(2, 1, 1, 2, 2)
        )
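
# Why both gradients come out as 0..7 (a sketch of the reasoning, assuming the
# region matching degenerates to a plain grouped conv when every region ID is 1):
# each group's 2x2 kernel exactly covers its 2x2 input, so the 1x1 output is a
# dot product; with a gradient of ones flowing back, d out / d src = filter and
# d out / d filter = src.
#
#     import numpy as np
#     src = np.arange(8, dtype=np.float32).reshape(1, 2, 2, 2)
#     flt = np.arange(8, dtype=np.float32).reshape(2, 1, 1, 2, 2)
#     dsrc = np.stack([flt[g, 0, 0] for g in range(2)])[None]        # values 0..7
#     dflt = np.stack([src[0, g] for g in range(2)])[:, None, None]  # values 0..7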
@pytest.mark.skipif(
    not is_cuda_available(), reason="rrconv cuda kernel requires cuda available"
)
def test_region_restricted_conv_forward_backward_cuda():
    import megengine as mge
    import megengine.module as M
    from megengine.autodiff import GradManager
    import megengine.distributed as dist

    # params
    handle = "gpu0"
    N = 1
    GROUP = 3
    FH = FW = 2
    IH = IW = 2
    OH = OW = 1
    ICPG = OCPG = 1
    grad_shape = (N, GROUP * ICPG, IH, IW)
    src_shape = grad_shape
    filter_shape = (GROUP, OCPG, ICPG, FH, FW)
    diff_shape = (N, GROUP * OCPG, OH, OW)
    rin_shape = (N, IH, IW)
    rout_shape = (N, OH, OW)

    def reduce(shape):
        mul = 1
        for x in shape:
            mul *= x
        return mul

    def get_groundtruth():
        src = tensor(
            np.arange(reduce(src_shape)).reshape(src_shape).astype(np.float32),
            device="cpu0",
        )
        filter = tensor(np.ones(filter_shape).astype(np.float32), device="cpu0")
        rin = tensor(np.ones(rin_shape).astype(np.int32), device="cpu0")
        rout = tensor(np.ones(rout_shape).astype(np.int32), device="cpu0")
        gm = GradManager().attach([src, filter])
        with gm:
            expected_out = F.region_restricted_conv(
                src, filter, rin, rout, groups=GROUP
            )
            gm.backward(
                expected_out,
                tensor(np.ones(diff_shape, dtype=np.float32), device="cpu0"),
            )
        return src, filter

    expected_src, expected_filter = get_groundtruth()
    src = tensor(
        np.arange(reduce(src_shape)).reshape(src_shape).astype(np.float32),
        device=handle,
    )
    filter = tensor(np.ones(filter_shape).astype(np.float32), device=handle)
    rin = tensor(np.ones(rin_shape).astype(np.int32), device=handle)
    rout = tensor(np.ones(rout_shape).astype(np.int32), device=handle)
    gm = GradManager().attach([src, filter])
    with gm:
        gpu_out = F.region_restricted_conv(src, filter, rin, rout, groups=GROUP)
        gm.backward(gpu_out, tensor(np.ones(diff_shape), device=handle))
        np.testing.assert_allclose(src.grad, expected_src.grad)
        np.testing.assert_allclose(filter.grad, expected_filter.grad)
@pytest.mark.skipif(
    not is_cuda_available(), reason="rrconv cuda kernel requires cuda available"
)
def test_region_restricted_conv_forward_backward_uint8():
    import megengine as mge
    import megengine.module as M
    from megengine.autodiff import GradManager

    # params
    handle = "gpu0"
    N = 1
    GROUP = 2
    FH = FW = 1
    IH = IW = 4
    OH = OW = 4
    ICPG = OCPG = 1
    grad_shape = (N, GROUP * ICPG, IH, IW)
    src_shape = grad_shape
    filter_shape = (GROUP, OCPG, ICPG, FH, FW)
    diff_shape = (N, GROUP * OCPG, OH, OW)
    rin_shape = (N, IH, IW)
    rout_shape = (N, OH, OW)

    def reduce(shape):
        mul = 1
        for x in shape:
            mul *= x
        return mul

    def get_groundtruth():
        src = tensor(
            np.arange(reduce(src_shape)).reshape(src_shape).astype(np.float32),
            device="cpu0",
        )
        filter = tensor(np.ones(filter_shape).astype(np.float32), device="cpu0")
        rin = tensor(np.ones(rin_shape).astype(np.int32), device="cpu0")
        rout = tensor(np.ones(rout_shape).astype(np.int32), device="cpu0")
        gm = GradManager().attach([src, filter])
        with gm:
            expected_out = F.region_restricted_conv(
                src, filter, rin, rout, groups=GROUP
            )
            gm.backward(
                expected_out,
                tensor(np.ones(diff_shape, dtype=np.float32), device="cpu0"),
            )
        return src, filter

    expected_src, expected_filter = get_groundtruth()
    # forward and dgrad/wgrad
    src = tensor(
        np.arange(reduce(src_shape)).reshape(src_shape).astype(np.float32),
        device=handle,
    )
    filter = tensor(np.ones(filter_shape).astype(np.float32), device=handle)
    rin = tensor(np.ones(rin_shape).astype(np.uint8), device=handle)
    rout = tensor(np.ones(rout_shape).astype(np.uint8), device=handle)
    gm = GradManager().attach([src, filter])
    with gm:
        gpu_out = F.region_restricted_conv(src, filter, rin, rout, groups=GROUP)
        gm.backward(
            gpu_out, tensor(np.ones(diff_shape, dtype=np.float32), device=handle)
        )
        # assert uint8 gpu result close to cpu result
        np.testing.assert_allclose(src.grad, expected_src.grad)
        np.testing.assert_allclose(filter.grad, expected_filter.grad)


def test_region_restricted_conv():
    test_region_restricted_conv_forward_backward_naive()
    if is_cuda_available():
        test_region_restricted_conv_forward_backward_cuda()
        test_region_restricted_conv_forward_backward_uint8()


def test_conv2d_autocast():
    """check amp's result is equal to manually converted result"""
    amp.enabled = True
@@ -3,9 +3,11 @@
#include "../blob_manager_impl.h"
#include "../dnn_op_helper.h"
#include "../op_trait.h"
#include "megbrain/common.h"
#include "megbrain/imperative/ops/autogen.h"
#include "megbrain/opr/internal/megdnn_opr_wrapper.h"
#include "megbrain/opr/tensor_gen.h"
#include "megdnn/oprs/nn.h"

namespace mgb {
namespace imperative {

@@ -356,5 +358,174 @@ OP_TRAIT_REG(Convolution3DBackwardData, Convolution3DBackwardData)
}  // namespace convolution3d_backward_data
}  // namespace
namespace {
namespace region_restricted_conv {

std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) {
    auto* node = &node_->cast_final_safe<opr::RegionRestrictedConvolution>();
    return RegionRestrictedConvolution::make(node->param());
}

auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
    auto&& conv = static_cast<const RegionRestrictedConvolution&>(def);
    OperatorNodeConfig config{conv.make_name()};
    return opr::RegionRestrictedConvolution::make(
            inputs[0], inputs[1], inputs[2], inputs[3], conv.param(), config);
}

std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
    auto&& region_restricted_conv =
            def.cast_final_safe<mgb::imperative::RegionRestrictedConvolution>();
    DnnOprHelper<megdnn::RegionRestrictedConvolutionForward> dnn_opr(
            region_restricted_conv.param());
    auto&& src = inputs[0].layout;
    auto&& filter = inputs[1].layout;
    auto&& rin = inputs[2].layout;
    auto&& rout = inputs[3].layout;
    TensorLayout output_layout{src.dtype};
    if (src.ndim && filter.ndim) {
        dnn_opr.opr().deduce_layout(src, filter, rin, rout, output_layout);
    }
    return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0};
}

SmallVector<TensorPtr> apply_on_physical_tensor(
        const OpDef& def, const SmallVector<TensorPtr>& inputs,
        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
    // create megdnn opr
    auto&& region_restricted_conv = def.cast_final_safe<RegionRestrictedConvolution>();
    CompNode cn = inputs[0]->comp_node();
    auto&& param = region_restricted_conv.param();
    DnnOprCaller<megdnn::RegionRestrictedConvolutionForward> dnn_opr(cn, param);
    auto srclo = inputs[0]->layout();
    auto filterlo = inputs[1]->layout();
    auto rinlo = inputs[2]->layout();
    auto routlo = inputs[3]->layout();
    auto out_layout = [&] {
        if (validated) {
            return output_descs[0].layout;
        } else {
            TensorLayout out_layout{inputs[0]->dtype()};
            dnn_opr.op()->deduce_layout(srclo, filterlo, rinlo, routlo, out_layout);
            return out_layout;
        }
    }();
    auto out = Tensor::make(out_layout, cn);
    dnn_opr.exec_with_ws(inputs[0], inputs[1], inputs[2], inputs[3], out);
    return {out};
}

OP_TRAIT_REG(
        RegionRestrictedConvolution, RegionRestrictedConvolution,
        opr::RegionRestrictedConvolution)
        .make_from_op_node(make_from_op_node)
        .apply_on_var_node(apply_on_var_node)
        .infer_output_attrs_fallible(infer_output_attrs_fallible)
        .apply_on_physical_tensor(apply_on_physical_tensor)
        .fallback();
}  // namespace region_restricted_conv
}  // namespace
namespace {
namespace region_restricted_conv_backward_data {

std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) {
    auto* node =
            &node_->cast_final_safe<opr::RegionRestrictedConvolutionBackwardData>();
    return RegionRestrictedConvolutionBackwardData::make(node->param());
}

auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
    auto&& conv = static_cast<const RegionRestrictedConvolutionBackwardData&>(def);
    OperatorNodeConfig config{conv.make_name()};
    // output_dtype may be inferred from the inputs within rrconv bwd data
    // (deduce_dtype api)
    CompNode cn = inputs[0]->comp_node();
    DType output_dtype;
    DnnOprCaller<megdnn::RegionRestrictedConvolutionBackwardData> dnn_opr(cn);
    dnn_opr.op()->deduce_dtype(
            inputs[0]->dtype(), inputs[1]->dtype(), inputs[2]->dtype(),
            inputs[3]->dtype(), output_dtype);
    if (output_dtype.valid())
        config.output_dtype(output_dtype);
    if (inputs.size() == 4) {
        return opr::RegionRestrictedConvolutionBackwardData::make(
                inputs[0], inputs[1], inputs[2], inputs[3], conv.param(), config);
    } else if (inputs.size() == 5) {
        return opr::RegionRestrictedConvolutionBackwardData::make(
                inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], conv.param(),
                config);
    }
    mgb_assert(0);
}
| std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible( | |||
| const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) { | |||
| auto&& convbwd = def.cast_final_safe< | |||
| mgb::imperative::RegionRestrictedConvolutionBackwardData>(); | |||
| DnnOprHelper<megdnn::RegionRestrictedConvolutionBackwardData> dnn_opr( | |||
| convbwd.param()); | |||
| TensorLayout filter = inputs[0].layout; | |||
| TensorLayout diff = inputs[1].layout; | |||
| TensorLayout rin = inputs[2].layout; | |||
| TensorLayout rout = inputs[3].layout; | |||
| DType output_dtype; | |||
| dnn_opr.opr().deduce_dtype( | |||
| inputs[0].layout.dtype, inputs[1].layout.dtype, inputs[2].layout.dtype, | |||
| inputs[3].layout.dtype, output_dtype); | |||
| TensorLayout output_layout{output_dtype}; | |||
| if (diff.ndim && filter.ndim) { | |||
| dnn_opr.opr().deduce_layout(filter, diff, rin, rout, output_layout); | |||
| } | |||
| return {{{output_layout, inputs[0].comp_node}}, output_layout.ndim != 0}; | |||
| } | |||
| SmallVector<TensorPtr> apply_on_physical_tensor( | |||
| const OpDef& def, const SmallVector<TensorPtr>& inputs, | |||
| SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { | |||
| auto&& convbwd = def.cast_final_safe<RegionRestrictedConvolutionBackwardData>(); | |||
| CompNode cn = inputs[0]->comp_node(); | |||
| DnnOprCaller<megdnn::RegionRestrictedConvolutionBackwardData> dnn_opr( | |||
| cn, convbwd.param()); | |||
| auto filterlo = inputs[0]->layout(); | |||
| auto difflo = inputs[1]->layout(); | |||
| auto rinlo = inputs[2]->layout(); | |||
| auto routlo = inputs[3]->layout(); | |||
| auto out_layout = [&] { | |||
| if (validated) { | |||
| return output_descs[0].layout; | |||
| } else { | |||
| TensorLayout out_layout{inputs[0]->dtype()}; | |||
| dnn_opr.op()->deduce_layout(filterlo, difflo, rinlo, routlo, out_layout); | |||
| return out_layout; | |||
| } | |||
| }(); | |||
| auto out = Tensor::make(out_layout, cn); | |||
| dnn_opr.exec_with_ws(inputs[0], inputs[1], inputs[2], inputs[3], out); | |||
| return {out}; | |||
| } | |||
| OP_TRAIT_REG( | |||
| RegionRestrictedConvolutionBackwardData, | |||
| RegionRestrictedConvolutionBackwardData, | |||
| opr::RegionRestrictedConvolutionBackwardData) | |||
| .make_from_op_node(make_from_op_node) | |||
| .apply_on_var_node(apply_on_var_node) | |||
| .infer_output_attrs_fallible(infer_output_attrs_fallible) | |||
| .apply_on_physical_tensor(apply_on_physical_tensor) | |||
| .fallback(); | |||
| } // namespace region_restricted_conv_backward_data | |||
| } // namespace | |||
| } // namespace imperative | |||
| } // namespace mgb | |||
@@ -1,7 +1,7 @@
905bdf78e5413b06873be64b4ba55db9 ../../dnn/scripts/opr_param_defs.py
40708c56b1f05fdb7d06cc097a300330 ../../src/core/include/megbrain/ir/ops.td
9f3af118c7fe8d0c9db433825d5ad77b generated/opdef.h.inl
4041e44a8ba3cca3b3affa1ed9ed44a2 generated/opdef.cpp.inl
319e1d170c989fe793a4e9c45decefc4 generated/opdef.py.inl
26a18a7593566128ecce76e8f74dcc5d generated/opdef.cpy.inl
da03ffe2a15411f902cd88920d3d47ec ../../src/core/include/megbrain/ir/ops.td
5756619f37e4dc130e1b049d7706d4eb generated/opdef.h.inl
98d1291eed73970ee087f898b6241358 generated/opdef.cpp.inl
b1a9c7569392942294c2168d40939eb5 generated/opdef.py.inl
3d88d5358d15a39219957f5257e32f5b generated/opdef.cpy.inl
71e1462bf4d882e2615c3c632cb671cc generated/enum_macro.h
@@ -5694,6 +5694,310 @@ OP_TRAIT_REG(Reduce, Reduce)
    .props(Reduce_props_impl)
    .make_name(Reduce_make_name_impl);

MGB_DYN_TYPE_OBJ_FINAL_IMPL(RegionRestrictedConvolution);

namespace {
size_t RegionRestrictedConvolution_hash_impl(const OpDef& def_) {
    auto&& op_ = def_.cast_final_safe<RegionRestrictedConvolution>();
    static_cast<void>(op_);
    size_t val = mgb::hash(op_.dyn_typeinfo());
    val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.mode));
    val = mgb::hash_pair_combine(val, mgb::hash(op_.pad_h));
    val = mgb::hash_pair_combine(val, mgb::hash(op_.pad_w));
    val = mgb::hash_pair_combine(val, mgb::hash(op_.stride_h));
    val = mgb::hash_pair_combine(val, mgb::hash(op_.stride_w));
    val = mgb::hash_pair_combine(val, mgb::hash(op_.dilate_h));
    val = mgb::hash_pair_combine(val, mgb::hash(op_.dilate_w));
    val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.sparse));
    val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.format));
    val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.compute_mode));
    return val;
}
bool RegionRestrictedConvolution_is_same_st_impl(const OpDef& lhs_, const OpDef& rhs_) {
    auto &&a_ = lhs_.cast_final_safe<RegionRestrictedConvolution>(),
         &&b_ = rhs_.cast_final_safe<RegionRestrictedConvolution>();
    static_cast<void>(a_);
    static_cast<void>(b_);
    if (a_.mode != b_.mode) return false;
    if (a_.pad_h != b_.pad_h) return false;
    if (a_.pad_w != b_.pad_w) return false;
    if (a_.stride_h != b_.stride_h) return false;
    if (a_.stride_w != b_.stride_w) return false;
    if (a_.dilate_h != b_.dilate_h) return false;
    if (a_.dilate_w != b_.dilate_w) return false;
    if (a_.sparse != b_.sparse) return false;
    if (a_.format != b_.format) return false;
    if (a_.compute_mode != b_.compute_mode) return false;
    return true;
}
std::vector<std::pair<const char*, std::string>> RegionRestrictedConvolution_props_impl(const OpDef& def_) {
    auto&& op_ = def_.cast_final_safe<RegionRestrictedConvolution>();
    static_cast<void>(op_);
    std::vector<std::pair<const char*, std::string>> props_;
    switch (op_.mode) {
    case RegionRestrictedConvolution::Mode::CROSS_CORRELATION:
        props_.emplace_back("mode", "CROSS_CORRELATION");
        break;
    case RegionRestrictedConvolution::Mode::CONVOLUTION:
        props_.emplace_back("mode", "CONVOLUTION");
        break;
    default:
        props_.emplace_back("mode", "INVALID");
        break;
    }
    props_.emplace_back("pad_h", std::to_string(op_.pad_h));
    props_.emplace_back("pad_w", std::to_string(op_.pad_w));
    props_.emplace_back("stride_h", std::to_string(op_.stride_h));
    props_.emplace_back("stride_w", std::to_string(op_.stride_w));
    props_.emplace_back("dilate_h", std::to_string(op_.dilate_h));
    props_.emplace_back("dilate_w", std::to_string(op_.dilate_w));
    switch (op_.sparse) {
    case RegionRestrictedConvolution::Sparse::DENSE:
        props_.emplace_back("sparse", "DENSE");
        break;
    case RegionRestrictedConvolution::Sparse::GROUP:
        props_.emplace_back("sparse", "GROUP");
        break;
    default:
        props_.emplace_back("sparse", "INVALID");
        break;
    }
    switch (op_.format) {
    case RegionRestrictedConvolution::Format::NCHW:
        props_.emplace_back("format", "NCHW");
        break;
    case RegionRestrictedConvolution::Format::NHWC:
        props_.emplace_back("format", "NHWC");
        break;
    case RegionRestrictedConvolution::Format::NHWCD4:
        props_.emplace_back("format", "NHWCD4");
        break;
    case RegionRestrictedConvolution::Format::NCHW4:
        props_.emplace_back("format", "NCHW4");
        break;
    case RegionRestrictedConvolution::Format::NCHW8:
        props_.emplace_back("format", "NCHW8");
        break;
    case RegionRestrictedConvolution::Format::NCHW32:
        props_.emplace_back("format", "NCHW32");
        break;
    case RegionRestrictedConvolution::Format::NCHW88:
        props_.emplace_back("format", "NCHW88");
        break;
    case RegionRestrictedConvolution::Format::NCHW44:
        props_.emplace_back("format", "NCHW44");
        break;
    case RegionRestrictedConvolution::Format::NCHW44_DOT:
        props_.emplace_back("format", "NCHW44_DOT");
        break;
    case RegionRestrictedConvolution::Format::NCHW4_NCHW32:
        props_.emplace_back("format", "NCHW4_NCHW32");
        break;
    case RegionRestrictedConvolution::Format::NCHW32_NCHW4:
        props_.emplace_back("format", "NCHW32_NCHW4");
        break;
    case RegionRestrictedConvolution::Format::NCHW4_NCHW:
        props_.emplace_back("format", "NCHW4_NCHW");
        break;
    case RegionRestrictedConvolution::Format::NHWC_NCHW:
        props_.emplace_back("format", "NHWC_NCHW");
        break;
    case RegionRestrictedConvolution::Format::NHWC_NCHW4_IC_SMALL:
        props_.emplace_back("format", "NHWC_NCHW4_IC_SMALL");
        break;
    case RegionRestrictedConvolution::Format::NCHW_NCHW4_IC_SMALL:
        props_.emplace_back("format", "NCHW_NCHW4_IC_SMALL");
        break;
    case RegionRestrictedConvolution::Format::CHWN4:
        props_.emplace_back("format", "CHWN4");
        break;
    case RegionRestrictedConvolution::Format::NCHW64:
        props_.emplace_back("format", "NCHW64");
        break;
    case RegionRestrictedConvolution::Format::NCHW4_NHWC:
        props_.emplace_back("format", "NCHW4_NHWC");
        break;
    default:
        props_.emplace_back("format", "INVALID");
        break;
    }
    switch (op_.compute_mode) {
    case RegionRestrictedConvolution::ComputeMode::DEFAULT:
        props_.emplace_back("compute_mode", "DEFAULT");
        break;
    case RegionRestrictedConvolution::ComputeMode::FLOAT32:
        props_.emplace_back("compute_mode", "FLOAT32");
        break;
    default:
        props_.emplace_back("compute_mode", "INVALID");
        break;
    }
    return props_;
}
std::string RegionRestrictedConvolution_make_name_impl(const OpDef& def_) {
    auto&& op_ = def_.cast_final_safe<RegionRestrictedConvolution>();
    static_cast<void>(op_);
    return "RegionRestrictedConvolution";
}
} // anonymous namespace

OP_TRAIT_REG(RegionRestrictedConvolution, RegionRestrictedConvolution)
    .hash(RegionRestrictedConvolution_hash_impl)
    .is_same_st(RegionRestrictedConvolution_is_same_st_impl)
    .props(RegionRestrictedConvolution_props_impl)
    .make_name(RegionRestrictedConvolution_make_name_impl);

MGB_DYN_TYPE_OBJ_FINAL_IMPL(RegionRestrictedConvolutionBackwardData);

namespace {
size_t RegionRestrictedConvolutionBackwardData_hash_impl(const OpDef& def_) {
    auto&& op_ = def_.cast_final_safe<RegionRestrictedConvolutionBackwardData>();
    static_cast<void>(op_);
    size_t val = mgb::hash(op_.dyn_typeinfo());
    val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.mode));
    val = mgb::hash_pair_combine(val, mgb::hash(op_.pad_h));
    val = mgb::hash_pair_combine(val, mgb::hash(op_.pad_w));
    val = mgb::hash_pair_combine(val, mgb::hash(op_.stride_h));
    val = mgb::hash_pair_combine(val, mgb::hash(op_.stride_w));
    val = mgb::hash_pair_combine(val, mgb::hash(op_.dilate_h));
    val = mgb::hash_pair_combine(val, mgb::hash(op_.dilate_w));
    val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.sparse));
    val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.format));
    val = mgb::hash_pair_combine(val, mgb::enumhash()(op_.compute_mode));
    return val;
}
bool RegionRestrictedConvolutionBackwardData_is_same_st_impl(const OpDef& lhs_, const OpDef& rhs_) {
    auto &&a_ = lhs_.cast_final_safe<RegionRestrictedConvolutionBackwardData>(),
         &&b_ = rhs_.cast_final_safe<RegionRestrictedConvolutionBackwardData>();
    static_cast<void>(a_);
    static_cast<void>(b_);
    if (a_.mode != b_.mode) return false;
    if (a_.pad_h != b_.pad_h) return false;
    if (a_.pad_w != b_.pad_w) return false;
    if (a_.stride_h != b_.stride_h) return false;
    if (a_.stride_w != b_.stride_w) return false;
    if (a_.dilate_h != b_.dilate_h) return false;
    if (a_.dilate_w != b_.dilate_w) return false;
    if (a_.sparse != b_.sparse) return false;
    if (a_.format != b_.format) return false;
    if (a_.compute_mode != b_.compute_mode) return false;
    return true;
}
std::vector<std::pair<const char*, std::string>> RegionRestrictedConvolutionBackwardData_props_impl(const OpDef& def_) {
    auto&& op_ = def_.cast_final_safe<RegionRestrictedConvolutionBackwardData>();
    static_cast<void>(op_);
    std::vector<std::pair<const char*, std::string>> props_;
    switch (op_.mode) {
    case RegionRestrictedConvolutionBackwardData::Mode::CROSS_CORRELATION:
        props_.emplace_back("mode", "CROSS_CORRELATION");
        break;
    case RegionRestrictedConvolutionBackwardData::Mode::CONVOLUTION:
        props_.emplace_back("mode", "CONVOLUTION");
        break;
    default:
        props_.emplace_back("mode", "INVALID");
        break;
    }
    props_.emplace_back("pad_h", std::to_string(op_.pad_h));
    props_.emplace_back("pad_w", std::to_string(op_.pad_w));
    props_.emplace_back("stride_h", std::to_string(op_.stride_h));
    props_.emplace_back("stride_w", std::to_string(op_.stride_w));
    props_.emplace_back("dilate_h", std::to_string(op_.dilate_h));
    props_.emplace_back("dilate_w", std::to_string(op_.dilate_w));
    switch (op_.sparse) {
    case RegionRestrictedConvolutionBackwardData::Sparse::DENSE:
        props_.emplace_back("sparse", "DENSE");
        break;
    case RegionRestrictedConvolutionBackwardData::Sparse::GROUP:
        props_.emplace_back("sparse", "GROUP");
        break;
    default:
        props_.emplace_back("sparse", "INVALID");
        break;
    }
    switch (op_.format) {
    case RegionRestrictedConvolutionBackwardData::Format::NCHW:
        props_.emplace_back("format", "NCHW");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NHWC:
        props_.emplace_back("format", "NHWC");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NHWCD4:
        props_.emplace_back("format", "NHWCD4");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NCHW4:
        props_.emplace_back("format", "NCHW4");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NCHW8:
        props_.emplace_back("format", "NCHW8");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NCHW32:
        props_.emplace_back("format", "NCHW32");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NCHW88:
        props_.emplace_back("format", "NCHW88");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NCHW44:
        props_.emplace_back("format", "NCHW44");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NCHW44_DOT:
        props_.emplace_back("format", "NCHW44_DOT");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NCHW4_NCHW32:
        props_.emplace_back("format", "NCHW4_NCHW32");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NCHW32_NCHW4:
        props_.emplace_back("format", "NCHW32_NCHW4");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NCHW4_NCHW:
        props_.emplace_back("format", "NCHW4_NCHW");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NHWC_NCHW:
        props_.emplace_back("format", "NHWC_NCHW");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NHWC_NCHW4_IC_SMALL:
        props_.emplace_back("format", "NHWC_NCHW4_IC_SMALL");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NCHW_NCHW4_IC_SMALL:
        props_.emplace_back("format", "NCHW_NCHW4_IC_SMALL");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::CHWN4:
        props_.emplace_back("format", "CHWN4");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NCHW64:
        props_.emplace_back("format", "NCHW64");
        break;
    case RegionRestrictedConvolutionBackwardData::Format::NCHW4_NHWC:
        props_.emplace_back("format", "NCHW4_NHWC");
        break;
    default:
        props_.emplace_back("format", "INVALID");
        break;
    }
    switch (op_.compute_mode) {
    case RegionRestrictedConvolutionBackwardData::ComputeMode::DEFAULT:
        props_.emplace_back("compute_mode", "DEFAULT");
        break;
    case RegionRestrictedConvolutionBackwardData::ComputeMode::FLOAT32:
        props_.emplace_back("compute_mode", "FLOAT32");
        break;
    default:
        props_.emplace_back("compute_mode", "INVALID");
        break;
    }
    return props_;
}
std::string RegionRestrictedConvolutionBackwardData_make_name_impl(const OpDef& def_) {
    auto&& op_ = def_.cast_final_safe<RegionRestrictedConvolutionBackwardData>();
    static_cast<void>(op_);
    return "RegionRestrictedConvolutionBackwardData";
}
} // anonymous namespace

OP_TRAIT_REG(RegionRestrictedConvolutionBackwardData, RegionRestrictedConvolutionBackwardData)
    .hash(RegionRestrictedConvolutionBackwardData_hash_impl)
    .is_same_st(RegionRestrictedConvolutionBackwardData_is_same_st_impl)
    .props(RegionRestrictedConvolutionBackwardData_props_impl)
    .make_name(RegionRestrictedConvolutionBackwardData_make_name_impl);

MGB_DYN_TYPE_OBJ_FINAL_IMPL(Remap);

namespace {
@@ -15368,6 +15368,580 @@ void _init_py_Reduce(py::module m) {
    mgb_assert(PyOp(OpDef)::ctype2pytype.emplace(Reduce::typeinfo(), &py_type).second);
}

void _init_py_RegionRestrictedConvolution_Mode(PyTypeObject& py_type) {
    auto& e_type = EnumWrapper<RegionRestrictedConvolution::Mode>::type;
    Py_INCREF(e_type);
    mgb_assert(PyDict_SetItemString(
            py_type.tp_dict, "Mode", reinterpret_cast<PyObject*>(e_type)) >= 0);
}

void _init_py_RegionRestrictedConvolution_Sparse(PyTypeObject& py_type) {
    auto& e_type = EnumWrapper<RegionRestrictedConvolution::Sparse>::type;
    Py_INCREF(e_type);
    mgb_assert(PyDict_SetItemString(
            py_type.tp_dict, "Sparse", reinterpret_cast<PyObject*>(e_type)) >= 0);
}

void _init_py_RegionRestrictedConvolution_Format(PyTypeObject& py_type) {
    auto& e_type = EnumWrapper<RegionRestrictedConvolution::Format>::type;
    Py_INCREF(e_type);
    mgb_assert(PyDict_SetItemString(
            py_type.tp_dict, "Format", reinterpret_cast<PyObject*>(e_type)) >= 0);
}

void _init_py_RegionRestrictedConvolution_ComputeMode(PyTypeObject& py_type) {
    auto& e_type = EnumWrapper<RegionRestrictedConvolution::ComputeMode>::type;
    Py_INCREF(e_type);
    mgb_assert(PyDict_SetItemString(
            py_type.tp_dict, "ComputeMode", reinterpret_cast<PyObject*>(e_type)) >= 0);
}

PyOpDefBegin(RegionRestrictedConvolution) // {
    static PyGetSetDef py_getsetters[];
    static PyMethodDef tp_methods[];
    static PyObject* getstate(PyObject* self, PyObject*) {
        auto& opdef = reinterpret_cast<PyOp(RegionRestrictedConvolution)*>(self)->inst();
        static_cast<void>(opdef);
        std::unordered_map<std::string, py::object> state {
            {"mode", serialization<decltype(opdef.mode)>::dump(opdef.mode)},
            {"pad_h", serialization<decltype(opdef.pad_h)>::dump(opdef.pad_h)},
            {"pad_w", serialization<decltype(opdef.pad_w)>::dump(opdef.pad_w)},
            {"stride_h", serialization<decltype(opdef.stride_h)>::dump(opdef.stride_h)},
            {"stride_w", serialization<decltype(opdef.stride_w)>::dump(opdef.stride_w)},
            {"dilate_h", serialization<decltype(opdef.dilate_h)>::dump(opdef.dilate_h)},
            {"dilate_w", serialization<decltype(opdef.dilate_w)>::dump(opdef.dilate_w)},
            {"sparse", serialization<decltype(opdef.sparse)>::dump(opdef.sparse)},
            {"format", serialization<decltype(opdef.format)>::dump(opdef.format)},
            {"compute_mode", serialization<decltype(opdef.compute_mode)>::dump(opdef.compute_mode)}
        };
        return py::cast(state).release().ptr();
    }
    static PyObject* setstate(PyObject* self, PyObject* args) {
        PyObject* dict = PyTuple_GetItem(args, 0);
        if (!dict) return NULL;
        auto state = py::cast<std::unordered_map<std::string, py::object>>(dict);
        auto& opdef = reinterpret_cast<PyOp(RegionRestrictedConvolution)*>(self)->inst();
        static_cast<void>(opdef);
        {
            auto&& iter = state.find("mode");
            if (iter != state.end()) {
                opdef.mode = serialization<decltype(opdef.mode)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("pad_h");
            if (iter != state.end()) {
                opdef.pad_h = serialization<decltype(opdef.pad_h)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("pad_w");
            if (iter != state.end()) {
                opdef.pad_w = serialization<decltype(opdef.pad_w)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("stride_h");
            if (iter != state.end()) {
                opdef.stride_h = serialization<decltype(opdef.stride_h)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("stride_w");
            if (iter != state.end()) {
                opdef.stride_w = serialization<decltype(opdef.stride_w)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("dilate_h");
            if (iter != state.end()) {
                opdef.dilate_h = serialization<decltype(opdef.dilate_h)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("dilate_w");
            if (iter != state.end()) {
                opdef.dilate_w = serialization<decltype(opdef.dilate_w)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("sparse");
            if (iter != state.end()) {
                opdef.sparse = serialization<decltype(opdef.sparse)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("format");
            if (iter != state.end()) {
                opdef.format = serialization<decltype(opdef.format)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("compute_mode");
            if (iter != state.end()) {
                opdef.compute_mode = serialization<decltype(opdef.compute_mode)>::load(iter->second);
            }
        }
        Py_RETURN_NONE;
    }
    static int py_init(PyObject *self, PyObject *args, PyObject *kwds);
// };
PyOpDefEnd(RegionRestrictedConvolution)

int PyOp(RegionRestrictedConvolution)::py_init(PyObject *self, PyObject *args, PyObject *kwds) {
    static const char* kwlist[] = {"mode", "pad_h", "pad_w", "stride_h", "stride_w", "dilate_h", "dilate_w", "sparse", "format", "compute_mode", "scope", NULL};
    PyObject *mode = NULL, *pad_h = NULL, *pad_w = NULL, *stride_h = NULL, *stride_w = NULL, *dilate_h = NULL, *dilate_w = NULL, *sparse = NULL, *format = NULL, *compute_mode = NULL, *scope = NULL;
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOOOOOOOOOO", const_cast<char**>(kwlist), &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilate_h, &dilate_w, &sparse, &format, &compute_mode, &scope))
        return -1;

    if (mode) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolution)*>(self)->inst().mode =
                    py::cast<decltype(RegionRestrictedConvolution::mode)>(py::handle(mode));
        } CATCH_ALL(-1)
    }

    if (pad_h) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolution)*>(self)->inst().pad_h =
                    py::cast<decltype(RegionRestrictedConvolution::pad_h)>(py::handle(pad_h));
        } CATCH_ALL(-1)
    }

    if (pad_w) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolution)*>(self)->inst().pad_w =
                    py::cast<decltype(RegionRestrictedConvolution::pad_w)>(py::handle(pad_w));
        } CATCH_ALL(-1)
    }

    if (stride_h) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolution)*>(self)->inst().stride_h =
                    py::cast<decltype(RegionRestrictedConvolution::stride_h)>(py::handle(stride_h));
        } CATCH_ALL(-1)
    }

    if (stride_w) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolution)*>(self)->inst().stride_w =
                    py::cast<decltype(RegionRestrictedConvolution::stride_w)>(py::handle(stride_w));
        } CATCH_ALL(-1)
    }

    if (dilate_h) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolution)*>(self)->inst().dilate_h =
                    py::cast<decltype(RegionRestrictedConvolution::dilate_h)>(py::handle(dilate_h));
        } CATCH_ALL(-1)
    }

    if (dilate_w) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolution)*>(self)->inst().dilate_w =
                    py::cast<decltype(RegionRestrictedConvolution::dilate_w)>(py::handle(dilate_w));
        } CATCH_ALL(-1)
    }

    if (sparse) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolution)*>(self)->inst().sparse =
                    py::cast<decltype(RegionRestrictedConvolution::sparse)>(py::handle(sparse));
        } CATCH_ALL(-1)
    }

    if (format) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolution)*>(self)->inst().format =
                    py::cast<decltype(RegionRestrictedConvolution::format)>(py::handle(format));
        } CATCH_ALL(-1)
    }

    if (compute_mode) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolution)*>(self)->inst().compute_mode =
                    py::cast<decltype(RegionRestrictedConvolution::compute_mode)>(py::handle(compute_mode));
        } CATCH_ALL(-1)
    }

    if (scope) {
        try {
            reinterpret_cast<PyOp(OpDef)*>(self)->op
                ->set_scope(py::cast<std::string>(py::handle(scope)));
        } CATCH_ALL(-1)
    }

    return 0;
}

PyGetSetDef PyOp(RegionRestrictedConvolution)::py_getsetters[] = {
    {const_cast<char*>("mode"), py_get_generic(RegionRestrictedConvolution, mode), py_set_generic(RegionRestrictedConvolution, mode), const_cast<char*>("mode"), NULL},
    {const_cast<char*>("pad_h"), py_get_generic(RegionRestrictedConvolution, pad_h), py_set_generic(RegionRestrictedConvolution, pad_h), const_cast<char*>("pad_h"), NULL},
    {const_cast<char*>("pad_w"), py_get_generic(RegionRestrictedConvolution, pad_w), py_set_generic(RegionRestrictedConvolution, pad_w), const_cast<char*>("pad_w"), NULL},
    {const_cast<char*>("stride_h"), py_get_generic(RegionRestrictedConvolution, stride_h), py_set_generic(RegionRestrictedConvolution, stride_h), const_cast<char*>("stride_h"), NULL},
    {const_cast<char*>("stride_w"), py_get_generic(RegionRestrictedConvolution, stride_w), py_set_generic(RegionRestrictedConvolution, stride_w), const_cast<char*>("stride_w"), NULL},
    {const_cast<char*>("dilate_h"), py_get_generic(RegionRestrictedConvolution, dilate_h), py_set_generic(RegionRestrictedConvolution, dilate_h), const_cast<char*>("dilate_h"), NULL},
    {const_cast<char*>("dilate_w"), py_get_generic(RegionRestrictedConvolution, dilate_w), py_set_generic(RegionRestrictedConvolution, dilate_w), const_cast<char*>("dilate_w"), NULL},
    {const_cast<char*>("sparse"), py_get_generic(RegionRestrictedConvolution, sparse), py_set_generic(RegionRestrictedConvolution, sparse), const_cast<char*>("sparse"), NULL},
    {const_cast<char*>("format"), py_get_generic(RegionRestrictedConvolution, format), py_set_generic(RegionRestrictedConvolution, format), const_cast<char*>("format"), NULL},
    {const_cast<char*>("compute_mode"), py_get_generic(RegionRestrictedConvolution, compute_mode), py_set_generic(RegionRestrictedConvolution, compute_mode), const_cast<char*>("compute_mode"), NULL},
    {NULL}  /* Sentinel */
};

PyMethodDef PyOp(RegionRestrictedConvolution)::tp_methods[] = {
    {const_cast<char*>("__getstate__"), PyOp(RegionRestrictedConvolution)::getstate, METH_NOARGS, "RegionRestrictedConvolution getstate"},
    {const_cast<char*>("__setstate__"), PyOp(RegionRestrictedConvolution)::setstate, METH_VARARGS, "RegionRestrictedConvolution setstate"},
    {NULL}  /* Sentinel */
};

void _init_py_RegionRestrictedConvolution(py::module m) {
    using py_op = PyOp(RegionRestrictedConvolution);
    auto& py_type = PyOpType(RegionRestrictedConvolution);
    py_type = {PyVarObject_HEAD_INIT(NULL, 0)};
    py_type.tp_name = "megengine.core._imperative_rt.ops.RegionRestrictedConvolution";
    py_type.tp_basicsize = sizeof(PyOp(RegionRestrictedConvolution));
    py_type.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE;
    py_type.tp_doc = "RegionRestrictedConvolution";
    py_type.tp_base = &PyOpType(OpDef);
    py_type.tp_dealloc = py_dealloc_generic<py_op>;
    py_type.tp_new = py_new_generic<py_op>;
    py_type.tp_init = py_op::py_init;
    py_type.tp_methods = py_op::tp_methods;
    py_type.tp_getset = py_op::py_getsetters;
    mgb_assert(PyType_Ready(&py_type) >= 0);
    _init_py_RegionRestrictedConvolution_Mode(py_type);
    _init_py_RegionRestrictedConvolution_Sparse(py_type);
    _init_py_RegionRestrictedConvolution_Format(py_type);
    _init_py_RegionRestrictedConvolution_ComputeMode(py_type);
    PyType_Modified(&py_type);
    m.add_object("RegionRestrictedConvolution", reinterpret_cast<PyObject*>(&py_type));
    mgb_assert(PyOp(OpDef)::ctype2pytype.emplace(RegionRestrictedConvolution::typeinfo(), &py_type).second);
}

void _init_py_RegionRestrictedConvolutionBackwardData_Mode(PyTypeObject& py_type) {
    auto& e_type = EnumWrapper<RegionRestrictedConvolutionBackwardData::Mode>::type;
    Py_INCREF(e_type);
    mgb_assert(PyDict_SetItemString(
            py_type.tp_dict, "Mode", reinterpret_cast<PyObject*>(e_type)) >= 0);
}

void _init_py_RegionRestrictedConvolutionBackwardData_Sparse(PyTypeObject& py_type) {
    auto& e_type = EnumWrapper<RegionRestrictedConvolutionBackwardData::Sparse>::type;
    Py_INCREF(e_type);
    mgb_assert(PyDict_SetItemString(
            py_type.tp_dict, "Sparse", reinterpret_cast<PyObject*>(e_type)) >= 0);
}

void _init_py_RegionRestrictedConvolutionBackwardData_Format(PyTypeObject& py_type) {
    auto& e_type = EnumWrapper<RegionRestrictedConvolutionBackwardData::Format>::type;
    Py_INCREF(e_type);
    mgb_assert(PyDict_SetItemString(
            py_type.tp_dict, "Format", reinterpret_cast<PyObject*>(e_type)) >= 0);
}

void _init_py_RegionRestrictedConvolutionBackwardData_ComputeMode(PyTypeObject& py_type) {
    auto& e_type = EnumWrapper<RegionRestrictedConvolutionBackwardData::ComputeMode>::type;
    Py_INCREF(e_type);
    mgb_assert(PyDict_SetItemString(
            py_type.tp_dict, "ComputeMode", reinterpret_cast<PyObject*>(e_type)) >= 0);
}

PyOpDefBegin(RegionRestrictedConvolutionBackwardData) // {
    static PyGetSetDef py_getsetters[];
    static PyMethodDef tp_methods[];
    static PyObject* getstate(PyObject* self, PyObject*) {
        auto& opdef = reinterpret_cast<PyOp(RegionRestrictedConvolutionBackwardData)*>(self)->inst();
        static_cast<void>(opdef);
        std::unordered_map<std::string, py::object> state {
            {"mode", serialization<decltype(opdef.mode)>::dump(opdef.mode)},
            {"pad_h", serialization<decltype(opdef.pad_h)>::dump(opdef.pad_h)},
            {"pad_w", serialization<decltype(opdef.pad_w)>::dump(opdef.pad_w)},
            {"stride_h", serialization<decltype(opdef.stride_h)>::dump(opdef.stride_h)},
            {"stride_w", serialization<decltype(opdef.stride_w)>::dump(opdef.stride_w)},
            {"dilate_h", serialization<decltype(opdef.dilate_h)>::dump(opdef.dilate_h)},
            {"dilate_w", serialization<decltype(opdef.dilate_w)>::dump(opdef.dilate_w)},
            {"sparse", serialization<decltype(opdef.sparse)>::dump(opdef.sparse)},
            {"format", serialization<decltype(opdef.format)>::dump(opdef.format)},
            {"compute_mode", serialization<decltype(opdef.compute_mode)>::dump(opdef.compute_mode)}
        };
        return py::cast(state).release().ptr();
    }
    static PyObject* setstate(PyObject* self, PyObject* args) {
        PyObject* dict = PyTuple_GetItem(args, 0);
        if (!dict) return NULL;
        auto state = py::cast<std::unordered_map<std::string, py::object>>(dict);
        auto& opdef = reinterpret_cast<PyOp(RegionRestrictedConvolutionBackwardData)*>(self)->inst();
        static_cast<void>(opdef);
        {
            auto&& iter = state.find("mode");
            if (iter != state.end()) {
                opdef.mode = serialization<decltype(opdef.mode)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("pad_h");
            if (iter != state.end()) {
                opdef.pad_h = serialization<decltype(opdef.pad_h)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("pad_w");
            if (iter != state.end()) {
                opdef.pad_w = serialization<decltype(opdef.pad_w)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("stride_h");
            if (iter != state.end()) {
                opdef.stride_h = serialization<decltype(opdef.stride_h)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("stride_w");
            if (iter != state.end()) {
                opdef.stride_w = serialization<decltype(opdef.stride_w)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("dilate_h");
            if (iter != state.end()) {
                opdef.dilate_h = serialization<decltype(opdef.dilate_h)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("dilate_w");
            if (iter != state.end()) {
                opdef.dilate_w = serialization<decltype(opdef.dilate_w)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("sparse");
            if (iter != state.end()) {
                opdef.sparse = serialization<decltype(opdef.sparse)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("format");
            if (iter != state.end()) {
                opdef.format = serialization<decltype(opdef.format)>::load(iter->second);
            }
        }
        {
            auto&& iter = state.find("compute_mode");
            if (iter != state.end()) {
                opdef.compute_mode = serialization<decltype(opdef.compute_mode)>::load(iter->second);
            }
        }
        Py_RETURN_NONE;
    }
    static int py_init(PyObject *self, PyObject *args, PyObject *kwds);
// };
PyOpDefEnd(RegionRestrictedConvolutionBackwardData)

int PyOp(RegionRestrictedConvolutionBackwardData)::py_init(PyObject *self, PyObject *args, PyObject *kwds) {
    static const char* kwlist[] = {"mode", "pad_h", "pad_w", "stride_h", "stride_w", "dilate_h", "dilate_w", "sparse", "format", "compute_mode", "scope", NULL};
    PyObject *mode = NULL, *pad_h = NULL, *pad_w = NULL, *stride_h = NULL, *stride_w = NULL, *dilate_h = NULL, *dilate_w = NULL, *sparse = NULL, *format = NULL, *compute_mode = NULL, *scope = NULL;
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OOOOOOOOOOO", const_cast<char**>(kwlist), &mode, &pad_h, &pad_w, &stride_h, &stride_w, &dilate_h, &dilate_w, &sparse, &format, &compute_mode, &scope))
        return -1;

    if (mode) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolutionBackwardData)*>(self)->inst().mode =
                    py::cast<decltype(RegionRestrictedConvolutionBackwardData::mode)>(py::handle(mode));
        } CATCH_ALL(-1)
    }

    if (pad_h) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolutionBackwardData)*>(self)->inst().pad_h =
                    py::cast<decltype(RegionRestrictedConvolutionBackwardData::pad_h)>(py::handle(pad_h));
        } CATCH_ALL(-1)
    }

    if (pad_w) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolutionBackwardData)*>(self)->inst().pad_w =
                    py::cast<decltype(RegionRestrictedConvolutionBackwardData::pad_w)>(py::handle(pad_w));
        } CATCH_ALL(-1)
    }

    if (stride_h) {
        try {
            // TODO: remove this guard which is used for pybind11 implicit conversion
            py::detail::loader_life_support guard{};
            reinterpret_cast<PyOp(RegionRestrictedConvolutionBackwardData)*>(self)->inst().stride_h =
                    py::cast<decltype(RegionRestrictedConvolutionBackwardData::stride_h)>(py::handle(stride_h));
| } | |||
| if (stride_w) { | |||
| try { | |||
| // TODO: remove this guard which is used for pybind11 implicit conversion | |||
| py::detail::loader_life_support guard{}; | |||
| reinterpret_cast<PyOp(RegionRestrictedConvolutionBackwardData)*>(self)->inst().stride_w = | |||
| py::cast<decltype(RegionRestrictedConvolutionBackwardData::stride_w)>(py::handle(stride_w)); | |||
| } CATCH_ALL(-1) | |||
| } | |||
| if (dilate_h) { | |||
| try { | |||
| // TODO: remove this guard which is used for pybind11 implicit conversion | |||
| py::detail::loader_life_support guard{}; | |||
| reinterpret_cast<PyOp(RegionRestrictedConvolutionBackwardData)*>(self)->inst().dilate_h = | |||
| py::cast<decltype(RegionRestrictedConvolutionBackwardData::dilate_h)>(py::handle(dilate_h)); | |||
| } CATCH_ALL(-1) | |||
| } | |||
| if (dilate_w) { | |||
| try { | |||
| // TODO: remove this guard which is used for pybind11 implicit conversion | |||
| py::detail::loader_life_support guard{}; | |||
| reinterpret_cast<PyOp(RegionRestrictedConvolutionBackwardData)*>(self)->inst().dilate_w = | |||
| py::cast<decltype(RegionRestrictedConvolutionBackwardData::dilate_w)>(py::handle(dilate_w)); | |||
| } CATCH_ALL(-1) | |||
| } | |||
| if (sparse) { | |||
| try { | |||
| // TODO: remove this guard which is used for pybind11 implicit conversion | |||
| py::detail::loader_life_support guard{}; | |||
| reinterpret_cast<PyOp(RegionRestrictedConvolutionBackwardData)*>(self)->inst().sparse = | |||
| py::cast<decltype(RegionRestrictedConvolutionBackwardData::sparse)>(py::handle(sparse)); | |||
| } CATCH_ALL(-1) | |||
| } | |||
| if (format) { | |||
| try { | |||
| // TODO: remove this guard which is used for pybind11 implicit conversion | |||
| py::detail::loader_life_support guard{}; | |||
| reinterpret_cast<PyOp(RegionRestrictedConvolutionBackwardData)*>(self)->inst().format = | |||
| py::cast<decltype(RegionRestrictedConvolutionBackwardData::format)>(py::handle(format)); | |||
| } CATCH_ALL(-1) | |||
| } | |||
| if (compute_mode) { | |||
| try { | |||
| // TODO: remove this guard which is used for pybind11 implicit conversion | |||
| py::detail::loader_life_support guard{}; | |||
| reinterpret_cast<PyOp(RegionRestrictedConvolutionBackwardData)*>(self)->inst().compute_mode = | |||
| py::cast<decltype(RegionRestrictedConvolutionBackwardData::compute_mode)>(py::handle(compute_mode)); | |||
| } CATCH_ALL(-1) | |||
| } | |||
| if (scope) { | |||
| try { | |||
| reinterpret_cast<PyOp(OpDef)*>(self)->op | |||
| ->set_scope(py::cast<std::string>(py::handle(scope))); | |||
| } CATCH_ALL(-1) | |||
| } | |||
| return 0; | |||
| } | |||
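| // Note: py_init above parses every field as an optional keyword | |||
| // ("|OOOOOOOOOOO"), so constructing the op with no arguments keeps the | |||
| // OpDef defaults, and any subset of fields can be overridden by name. | |||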
| PyGetSetDef PyOp(RegionRestrictedConvolutionBackwardData)::py_getsetters[] = { | |||
| {const_cast<char*>("mode"), py_get_generic(RegionRestrictedConvolutionBackwardData, mode), py_set_generic(RegionRestrictedConvolutionBackwardData, mode), const_cast<char*>("mode"), NULL}, | |||
| {const_cast<char*>("pad_h"), py_get_generic(RegionRestrictedConvolutionBackwardData, pad_h), py_set_generic(RegionRestrictedConvolutionBackwardData, pad_h), const_cast<char*>("pad_h"), NULL}, | |||
| {const_cast<char*>("pad_w"), py_get_generic(RegionRestrictedConvolutionBackwardData, pad_w), py_set_generic(RegionRestrictedConvolutionBackwardData, pad_w), const_cast<char*>("pad_w"), NULL}, | |||
| {const_cast<char*>("stride_h"), py_get_generic(RegionRestrictedConvolutionBackwardData, stride_h), py_set_generic(RegionRestrictedConvolutionBackwardData, stride_h), const_cast<char*>("stride_h"), NULL}, | |||
| {const_cast<char*>("stride_w"), py_get_generic(RegionRestrictedConvolutionBackwardData, stride_w), py_set_generic(RegionRestrictedConvolutionBackwardData, stride_w), const_cast<char*>("stride_w"), NULL}, | |||
| {const_cast<char*>("dilate_h"), py_get_generic(RegionRestrictedConvolutionBackwardData, dilate_h), py_set_generic(RegionRestrictedConvolutionBackwardData, dilate_h), const_cast<char*>("dilate_h"), NULL}, | |||
| {const_cast<char*>("dilate_w"), py_get_generic(RegionRestrictedConvolutionBackwardData, dilate_w), py_set_generic(RegionRestrictedConvolutionBackwardData, dilate_w), const_cast<char*>("dilate_w"), NULL}, | |||
| {const_cast<char*>("sparse"), py_get_generic(RegionRestrictedConvolutionBackwardData, sparse), py_set_generic(RegionRestrictedConvolutionBackwardData, sparse), const_cast<char*>("sparse"), NULL}, | |||
| {const_cast<char*>("format"), py_get_generic(RegionRestrictedConvolutionBackwardData, format), py_set_generic(RegionRestrictedConvolutionBackwardData, format), const_cast<char*>("format"), NULL}, | |||
| {const_cast<char*>("compute_mode"), py_get_generic(RegionRestrictedConvolutionBackwardData, compute_mode), py_set_generic(RegionRestrictedConvolutionBackwardData, compute_mode), const_cast<char*>("compute_mode"), NULL}, | |||
| {NULL} /* Sentinel */ | |||
| }; | |||
| PyMethodDef PyOp(RegionRestrictedConvolutionBackwardData)::tp_methods[] = { | |||
| {const_cast<char*>("__getstate__"), PyOp(RegionRestrictedConvolutionBackwardData)::getstate, METH_NOARGS, "RegionRestrictedConvolutionBackwardData getstate"}, | |||
| {const_cast<char*>("__setstate__"), PyOp(RegionRestrictedConvolutionBackwardData)::setstate, METH_VARARGS, "RegionRestrictedConvolutionBackwardData setstate"}, | |||
| {NULL} /* Sentinel */ | |||
| }; | |||
| void _init_py_RegionRestrictedConvolutionBackwardData(py::module m) { | |||
| using py_op = PyOp(RegionRestrictedConvolutionBackwardData); | |||
| auto& py_type = PyOpType(RegionRestrictedConvolutionBackwardData); | |||
| py_type = {PyVarObject_HEAD_INIT(NULL, 0)}; | |||
| py_type.tp_name = "megengine.core._imperative_rt.ops.RegionRestrictedConvolutionBackwardData"; | |||
| py_type.tp_basicsize = sizeof(PyOp(RegionRestrictedConvolutionBackwardData)); | |||
| py_type.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; | |||
| py_type.tp_doc = "RegionRestrictedConvolutionBackwardData"; | |||
| py_type.tp_base = &PyOpType(OpDef); | |||
| py_type.tp_dealloc = py_dealloc_generic<py_op>; | |||
| py_type.tp_new = py_new_generic<py_op>; | |||
| py_type.tp_init = py_op::py_init; | |||
| py_type.tp_methods = py_op::tp_methods; | |||
| py_type.tp_getset = py_op::py_getsetters; | |||
| mgb_assert(PyType_Ready(&py_type) >= 0); | |||
| _init_py_RegionRestrictedConvolutionBackwardData_Mode(py_type); | |||
| _init_py_RegionRestrictedConvolutionBackwardData_Sparse(py_type); | |||
| _init_py_RegionRestrictedConvolutionBackwardData_Format(py_type); | |||
| _init_py_RegionRestrictedConvolutionBackwardData_ComputeMode(py_type); | |||
| PyType_Modified(&py_type); | |||
| m.add_object("RegionRestrictedConvolutionBackwardData", reinterpret_cast<PyObject*>(&py_type)); | |||
| mgb_assert(PyOp(OpDef)::ctype2pytype.emplace(RegionRestrictedConvolutionBackwardData::typeinfo(), &py_type).second); | |||
| } | |||
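| // The function above follows the standard generated-binding pattern: fill | |||
| // in the heap type, attach the pickle helpers (__getstate__/__setstate__) | |||
| // and the per-field getsetters, hang the four enum sub-types on the class, | |||
| // then publish the type on the module and in the OpDef ctype->pytype map | |||
| // so instances can be routed back to the C++ OpDef. | |||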
| template<> struct EnumTrait<Remap::InterpolationMode> { | |||
| static constexpr const char *name = "Remap.InterpolationMode"; | |||
| static constexpr std::underlying_type_t<Remap::InterpolationMode> max = 5 - 1; | |||
| @@ -18700,6 +19274,8 @@ void _init_py_WarpPerspectiveBackwardMat(py::module m) { | |||
| _init_py_ROIAlign(m); \ | |||
| _init_py_ROIPooling(m); \ | |||
| _init_py_Reduce(m); \ | |||
| _init_py_RegionRestrictedConvolution(m); \ | |||
| _init_py_RegionRestrictedConvolutionBackwardData(m); \ | |||
| _init_py_Remap(m); \ | |||
| _init_py_RemoteRecv(m); \ | |||
| _init_py_RemoteSend(m); \ | |||
| @@ -1517,6 +1517,58 @@ public: | |||
| } | |||
| }; | |||
| class RegionRestrictedConvolution : public OpDefImplBase<RegionRestrictedConvolution> { | |||
| MGB_DYN_TYPE_OBJ_FINAL_DECL; | |||
| public: | |||
| using Mode = ::megdnn::param::Convolution::Mode; | |||
| using Sparse = ::megdnn::param::Convolution::Sparse; | |||
| using Format = ::megdnn::param::Convolution::Format; | |||
| using ComputeMode = ::megdnn::param::Convolution::ComputeMode; | |||
| Mode mode = ::megdnn::param::Convolution::Mode::CROSS_CORRELATION; | |||
| uint32_t pad_h = 0; | |||
| uint32_t pad_w = 0; | |||
| uint32_t stride_h = 1; | |||
| uint32_t stride_w = 1; | |||
| uint32_t dilate_h = 1; | |||
| uint32_t dilate_w = 1; | |||
| Sparse sparse = ::megdnn::param::Convolution::Sparse::DENSE; | |||
| Format format = ::megdnn::param::Convolution::Format::NCHW; | |||
| ComputeMode compute_mode = ::megdnn::param::Convolution::ComputeMode::DEFAULT; | |||
| RegionRestrictedConvolution() = default; | |||
| RegionRestrictedConvolution(Mode mode_, uint32_t pad_h_, uint32_t pad_w_, uint32_t stride_h_, uint32_t stride_w_, uint32_t dilate_h_, uint32_t dilate_w_, Sparse sparse_, Format format_, ComputeMode compute_mode_, std::string scope_ = {}): mode(mode_), pad_h(pad_h_), pad_w(pad_w_), stride_h(stride_h_), stride_w(stride_w_), dilate_h(dilate_h_), dilate_w(dilate_w_), sparse(sparse_), format(format_), compute_mode(compute_mode_) { set_scope(scope_); } | |||
| RegionRestrictedConvolution(::megdnn::param::Convolution packed_param_0): mode(packed_param_0.mode), pad_h(packed_param_0.pad_h), pad_w(packed_param_0.pad_w), stride_h(packed_param_0.stride_h), stride_w(packed_param_0.stride_w), dilate_h(packed_param_0.dilate_h), dilate_w(packed_param_0.dilate_w), sparse(packed_param_0.sparse), format(packed_param_0.format), compute_mode(packed_param_0.compute_mode) {} | |||
| ::megdnn::param::Convolution param() const { | |||
| return {mode, pad_h, pad_w, stride_h, stride_w, dilate_h, dilate_w, sparse, format, compute_mode}; | |||
| } | |||
| }; | |||
| class RegionRestrictedConvolutionBackwardData : public OpDefImplBase<RegionRestrictedConvolutionBackwardData> { | |||
| MGB_DYN_TYPE_OBJ_FINAL_DECL; | |||
| public: | |||
| using Mode = ::megdnn::param::Convolution::Mode; | |||
| using Sparse = ::megdnn::param::Convolution::Sparse; | |||
| using Format = ::megdnn::param::Convolution::Format; | |||
| using ComputeMode = ::megdnn::param::Convolution::ComputeMode; | |||
| Mode mode = ::megdnn::param::Convolution::Mode::CROSS_CORRELATION; | |||
| uint32_t pad_h = 0; | |||
| uint32_t pad_w = 0; | |||
| uint32_t stride_h = 1; | |||
| uint32_t stride_w = 1; | |||
| uint32_t dilate_h = 1; | |||
| uint32_t dilate_w = 1; | |||
| Sparse sparse = ::megdnn::param::Convolution::Sparse::DENSE; | |||
| Format format = ::megdnn::param::Convolution::Format::NCHW; | |||
| ComputeMode compute_mode = ::megdnn::param::Convolution::ComputeMode::DEFAULT; | |||
| RegionRestrictedConvolutionBackwardData() = default; | |||
| RegionRestrictedConvolutionBackwardData(Mode mode_, uint32_t pad_h_, uint32_t pad_w_, uint32_t stride_h_, uint32_t stride_w_, uint32_t dilate_h_, uint32_t dilate_w_, Sparse sparse_, Format format_, ComputeMode compute_mode_, std::string scope_ = {}): mode(mode_), pad_h(pad_h_), pad_w(pad_w_), stride_h(stride_h_), stride_w(stride_w_), dilate_h(dilate_h_), dilate_w(dilate_w_), sparse(sparse_), format(format_), compute_mode(compute_mode_) { set_scope(scope_); } | |||
| RegionRestrictedConvolutionBackwardData(::megdnn::param::Convolution packed_param_0): mode(packed_param_0.mode), pad_h(packed_param_0.pad_h), pad_w(packed_param_0.pad_w), stride_h(packed_param_0.stride_h), stride_w(packed_param_0.stride_w), dilate_h(packed_param_0.dilate_h), dilate_w(packed_param_0.dilate_w), sparse(packed_param_0.sparse), format(packed_param_0.format), compute_mode(packed_param_0.compute_mode) {} | |||
| ::megdnn::param::Convolution param() const { | |||
| return {mode, pad_h, pad_w, stride_h, stride_w, dilate_h, dilate_w, sparse, format, compute_mode}; | |||
| } | |||
| }; | |||
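| // Hedged illustration (not part of the generated code): both classes pack | |||
| // into and unpack from the same ::megdnn::param::Convolution, so the param | |||
| // round trip is lossless for every field. A minimal sketch: | |||
| inline uint32_t example_rrconv_param_roundtrip() { | |||
| RegionRestrictedConvolution op; | |||
| op.stride_h = 2; // mutate one field | |||
| RegionRestrictedConvolution unpacked{op.param()}; // pack, then unpack | |||
| return unpacked.stride_h; // the mutated value, 2, survives the round trip | |||
| } | |||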
| class Remap : public OpDefImplBase<Remap> { | |||
| MGB_DYN_TYPE_OBJ_FINAL_DECL; | |||
| @@ -1620,6 +1620,52 @@ ReduceInst | |||
| .def_readwrite("data_type", &Reduce::data_type) | |||
| .def_readwrite("keepdim", &Reduce::keepdim); | |||
| py::class_<RegionRestrictedConvolution, std::shared_ptr<RegionRestrictedConvolution>, OpDef> RegionRestrictedConvolutionInst(m, "RegionRestrictedConvolution"); | |||
| RegionRestrictedConvolutionInst.attr("Mode") = BatchConvBiasInst.attr("Mode"); | |||
| RegionRestrictedConvolutionInst.attr("Sparse") = BatchConvBiasInst.attr("Sparse"); | |||
| RegionRestrictedConvolutionInst.attr("Format") = AdaptivePoolingInst.attr("Format"); | |||
| RegionRestrictedConvolutionInst.attr("ComputeMode") = BatchConvBiasInst.attr("ComputeMode"); | |||
| RegionRestrictedConvolutionInst | |||
| .def(py::init<::megdnn::param::Convolution::Mode, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, ::megdnn::param::Convolution::Sparse, ::megdnn::param::Convolution::Format, ::megdnn::param::Convolution::ComputeMode, std::string>(), py::arg("mode") = ::megdnn::param::Convolution::Mode::CROSS_CORRELATION, py::arg("pad_h") = 0, py::arg("pad_w") = 0, py::arg("stride_h") = 1, py::arg("stride_w") = 1, py::arg("dilate_h") = 1, py::arg("dilate_w") = 1, py::arg("sparse") = ::megdnn::param::Convolution::Sparse::DENSE, py::arg("format") = ::megdnn::param::Convolution::Format::NCHW, py::arg("compute_mode") = ::megdnn::param::Convolution::ComputeMode::DEFAULT, py::arg("scope") = {}) | |||
| .def_readwrite("mode", &RegionRestrictedConvolution::mode) | |||
| .def_readwrite("pad_h", &RegionRestrictedConvolution::pad_h) | |||
| .def_readwrite("pad_w", &RegionRestrictedConvolution::pad_w) | |||
| .def_readwrite("stride_h", &RegionRestrictedConvolution::stride_h) | |||
| .def_readwrite("stride_w", &RegionRestrictedConvolution::stride_w) | |||
| .def_readwrite("dilate_h", &RegionRestrictedConvolution::dilate_h) | |||
| .def_readwrite("dilate_w", &RegionRestrictedConvolution::dilate_w) | |||
| .def_readwrite("sparse", &RegionRestrictedConvolution::sparse) | |||
| .def_readwrite("format", &RegionRestrictedConvolution::format) | |||
| .def_readwrite("compute_mode", &RegionRestrictedConvolution::compute_mode); | |||
| py::class_<RegionRestrictedConvolutionBackwardData, std::shared_ptr<RegionRestrictedConvolutionBackwardData>, OpDef> RegionRestrictedConvolutionBackwardDataInst(m, "RegionRestrictedConvolutionBackwardData"); | |||
| RegionRestrictedConvolutionBackwardDataInst.attr("Mode") = BatchConvBiasInst.attr("Mode"); | |||
| RegionRestrictedConvolutionBackwardDataInst.attr("Sparse") = BatchConvBiasInst.attr("Sparse"); | |||
| RegionRestrictedConvolutionBackwardDataInst.attr("Format") = AdaptivePoolingInst.attr("Format"); | |||
| RegionRestrictedConvolutionBackwardDataInst.attr("ComputeMode") = BatchConvBiasInst.attr("ComputeMode"); | |||
| RegionRestrictedConvolutionBackwardDataInst | |||
| .def(py::init<::megdnn::param::Convolution::Mode, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, ::megdnn::param::Convolution::Sparse, ::megdnn::param::Convolution::Format, ::megdnn::param::Convolution::ComputeMode, std::string>(), py::arg("mode") = ::megdnn::param::Convolution::Mode::CROSS_CORRELATION, py::arg("pad_h") = 0, py::arg("pad_w") = 0, py::arg("stride_h") = 1, py::arg("stride_w") = 1, py::arg("dilate_h") = 1, py::arg("dilate_w") = 1, py::arg("sparse") = ::megdnn::param::Convolution::Sparse::DENSE, py::arg("format") = ::megdnn::param::Convolution::Format::NCHW, py::arg("compute_mode") = ::megdnn::param::Convolution::ComputeMode::DEFAULT, py::arg("scope") = {}) | |||
| .def_readwrite("mode", &RegionRestrictedConvolutionBackwardData::mode) | |||
| .def_readwrite("pad_h", &RegionRestrictedConvolutionBackwardData::pad_h) | |||
| .def_readwrite("pad_w", &RegionRestrictedConvolutionBackwardData::pad_w) | |||
| .def_readwrite("stride_h", &RegionRestrictedConvolutionBackwardData::stride_h) | |||
| .def_readwrite("stride_w", &RegionRestrictedConvolutionBackwardData::stride_w) | |||
| .def_readwrite("dilate_h", &RegionRestrictedConvolutionBackwardData::dilate_h) | |||
| .def_readwrite("dilate_w", &RegionRestrictedConvolutionBackwardData::dilate_w) | |||
| .def_readwrite("sparse", &RegionRestrictedConvolutionBackwardData::sparse) | |||
| .def_readwrite("format", &RegionRestrictedConvolutionBackwardData::format) | |||
| .def_readwrite("compute_mode", &RegionRestrictedConvolutionBackwardData::compute_mode); | |||
| py::class_<Remap, std::shared_ptr<Remap>, OpDef> RemapInst(m, "Remap"); | |||
| py::enum_<Remap::InterpolationMode>(RemapInst, "InterpolationMode") | |||
| @@ -520,4 +520,9 @@ def MeshGrid: MgbHashableOp<"MeshGrid"> { | |||
| MgbStringAttr:$indexing | |||
| ); | |||
| } | |||
| def RegionRestrictedConvolution: MgbHashableOp<"RegionRestrictedConvolution", [ConvolutionParam]>; | |||
| def RegionRestrictedConvolutionBackwardData: MgbHashableOp<"RegionRestrictedConvolutionBackwardData", [ConvolutionParam]>; | |||
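| // Both defs reuse ConvolutionParam, so the new ops hash and serialize with | |||
| // exactly the same fields as a plain Convolution. | |||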
| #endif // MGB_OPS | |||
| @@ -25,6 +25,58 @@ using namespace cg::static_infer; | |||
| using intl::WorkspaceLimitGetter; | |||
| /* ==================== misc impl ==================== */ | |||
| template <typename MGBOPR, typename DNNOPR> | |||
| void mixin::RegionConvBackwardDataMixin::init_output_static_infer_desc_for_bwd_data( | |||
| cg::OperatorNodeBase* self) { | |||
| using namespace cg::static_infer; | |||
| auto&& mgr = self->owner_graph()->static_infer_manager(); | |||
| DepVal inp_deps; | |||
| inp_deps.reserve(6); | |||
| for (int i = 0; i < 4; i++) { | |||
| inp_deps.push_back({self->input(i), DepType::SHAPE}); | |||
| } | |||
| auto infer_shp = [self](TensorShape& dest, const InpVal& inp) { | |||
| TensorLayout ol{self->output(0)->dtype()}; | |||
| mgb_assert( | |||
| self->input(0)->dtype().category() == DTypeCategory::FLOAT && | |||
| self->input(1)->dtype().category() == DTypeCategory::FLOAT && | |||
| self->input(2)->dtype().category() == DTypeCategory::INT && | |||
| self->input(3)->dtype().category() == DTypeCategory::INT, | |||
| "region conv dtype assert error!"); | |||
| static_cast<MGBOPR*>(self)->megdnn_opr()->deduce_layout( | |||
| {inp.val.at(0).shape(), self->input(0)->dtype()}, // filter | |||
| {inp.val.at(1).shape(), self->input(1)->dtype()}, // diff | |||
| {inp.val.at(2).shape(), self->input(2)->dtype()}, // rin | |||
| {inp.val.at(3).shape(), self->input(3)->dtype()}, // rout | |||
| ol // grad | |||
| ); | |||
| dest = ol; | |||
| return true; | |||
| }; | |||
| mgr.register_shape_infer(self->output(0), {SourceType::DEP, inp_deps, infer_shp}); | |||
| // workspace size | |||
| auto infer_wk = [self](TensorShape& dest, const InpVal& inp) { | |||
| TensorLayout ol{self->output(0)->dtype()}; | |||
| dest.ndim = 1; | |||
| dest.shape[0] = | |||
| static_cast<MGBOPR*>(self)->megdnn_opr()->get_workspace_in_bytes( | |||
| {self->input(0)->shape(), self->input(0)->dtype()}, // filter | |||
| {self->input(1)->shape(), self->input(1)->dtype()}, // diff | |||
| {self->input(2)->shape(), self->input(2)->dtype()}, // rin | |||
| {self->input(3)->shape(), self->input(3)->dtype()}, // rout | |||
| ol); | |||
| return true; | |||
| }; | |||
| inp_deps.push_back({self->output(0), DepType::SHAPE}); | |||
| auto workspace_dep_var = | |||
| intl::WorkspaceLimitGetter::register_to_graph(self->owner_graph()); | |||
| if (workspace_dep_var) | |||
| inp_deps.push_back({workspace_dep_var, DepType::VALUE}); | |||
| mgr.register_shape_infer(self->output(1), {SourceType::DEP, inp_deps, infer_wk}); | |||
| } | |||
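| // Dependency summary for the registration above: output(0) (the data grad) | |||
| // is inferred from the SHAPE of the four inputs (filter, diff, rin, rout); | |||
| // output(1) (the workspace) additionally depends on output(0)'s shape and, | |||
| // when present, on the graph-wide workspace-limit variable, hence the | |||
| // reserve(6) above. | |||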
| template <class MgbOpr, class MegDNNOpr> | |||
| void mixin::ConvolutionBackwardDataMixin::init_output_static_infer_desc_for_bwd_data( | |||
| @@ -1535,6 +1587,226 @@ void BatchConvBiasForward::init_output_format() { | |||
| output(0)->format(input(0)->format()); | |||
| } | |||
| /* ========================== RegionRestrictedConvolutionForward | |||
| * ========================== */ | |||
| IMPL_CONV(RegionRestrictedConvolutionForward); | |||
| RegionRestrictedConvolutionForward::RegionRestrictedConvolutionForward( | |||
| VarNode* src, VarNode* filter, VarNode* region_in, VarNode* region_out, | |||
| const Param& param, const OperatorNodeConfig& config) | |||
| : Super(src->owner_graph(), config, "region_restricted_conv_fwd", | |||
| {src, filter, region_in, region_out}) { | |||
| init_megdnn_opr(*this, param); | |||
| add_input({src, filter, region_in, region_out}); | |||
| } | |||
| SymbolVar RegionRestrictedConvolutionForward::make( | |||
| SymbolVar src, SymbolVar filter, SymbolVar region_in, SymbolVar region_out, | |||
| const Param& param, const OperatorNodeConfig& config) { | |||
| return src.insert_single_output_opr<RegionRestrictedConvolutionForward>( | |||
| src.node(), filter.node(), region_in.node(), region_out.node(), param, | |||
| config); | |||
| } | |||
| void RegionRestrictedConvolutionForward::init_output_dtype() { | |||
| mgb_assert( | |||
| input(0)->dtype().category() == DTypeCategory::FLOAT, | |||
| "input dtype only support FLOAT, \ | |||
| but got input dtype: %s", | |||
| input(0)->dtype().name()); | |||
| output(0)->dtype(input(0)->dtype()); | |||
| } | |||
| size_t RegionRestrictedConvolutionForward::get_workspace_size_bytes( | |||
| const TensorShapeArray& input_shapes, | |||
| const TensorShapeArray& output_shapes) const { | |||
| return megdnn_opr()->get_workspace_in_bytes( | |||
| {input_shapes[0], input(0)->dtype(), input(0)->format()}, | |||
| {input_shapes[1], input(1)->dtype(), input(1)->format()}, | |||
| {input_shapes[2], input(2)->dtype(), input(2)->format()}, | |||
| {input_shapes[3], input(3)->dtype(), input(3)->format()}, | |||
| {output_shapes[0], output(0)->dtype(), output(0)->format()}); | |||
| } | |||
| #if MGB_ENABLE_GRAD | |||
| MGB_IMPL_OPR_GRAD(RegionRestrictedConvolutionForward) { | |||
| mgb_assert( | |||
| opr.input(0)->dtype().category() == DTypeCategory::FLOAT && | |||
| opr.input(1)->dtype().category() == DTypeCategory::FLOAT && | |||
| opr.input(2)->dtype().category() == DTypeCategory::INT && | |||
| opr.input(3)->dtype().category() == DTypeCategory::INT, | |||
| "only float data type supported for grad"); | |||
| if (wrt_idx == 0) { // src | |||
| SymbolVar grad = RegionRestrictedConvolutionBackwardData::make( | |||
| opr.input(1), // filter | |||
| out_grad[0], // diff | |||
| opr.input(2), // rin | |||
| opr.input(3), // rout | |||
| opr.input(0), // src | |||
| opr.param()); | |||
| return grad.node(); | |||
| } | |||
| // TODO: the CUDA wgrad (filter-gradient) kernel is not implemented yet. | |||
| if (wrt_idx == 1) { // filter | |||
| SymbolVar grad = RegionRestrictedConvolutionBackwardFilter::make( | |||
| opr.input(0), // src | |||
| out_grad[0], // diff | |||
| opr.input(2), // rin | |||
| opr.input(3), // rout | |||
| opr.input(1), // filter | |||
| opr.param()); | |||
| return grad.node(); | |||
| } | |||
| return nullptr; | |||
| } | |||
| #endif | |||
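| // Note: grads w.r.t. inputs 2 and 3 (the integer region masks rin/rout) | |||
| // fall through to the final nullptr above, since integer masks carry no | |||
| // gradient. | |||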
| /* ========================== RegionRestrictedConvolutionBackwardData | |||
| * ========================== */ | |||
| IMPL_CONV(RegionRestrictedConvolutionBackwardData); | |||
| RegionRestrictedConvolutionBackwardData::RegionRestrictedConvolutionBackwardData( | |||
| VarNode* filter, VarNode* diff, VarNode* region_in, VarNode* region_out, | |||
| VarNode* src, const Param& param, const OperatorNodeConfig& config) | |||
| : Super{filter->owner_graph(), | |||
| config, | |||
| "region_restricted_conv_bwd_data", | |||
| {filter, diff, region_in, region_out}} { | |||
| init_megdnn_opr(*this, param); | |||
| add_input({filter, diff, region_in, region_out}); | |||
| if (src) | |||
| add_input({src}); | |||
| } | |||
| SymbolVar RegionRestrictedConvolutionBackwardData::make( | |||
| SymbolVar filter, SymbolVar diff, SymbolVar region_in, SymbolVar region_out, | |||
| SymbolVar src, const Param& param, const OperatorNodeConfig& config) { | |||
| return filter.insert_single_output_opr<RegionRestrictedConvolutionBackwardData>( | |||
| filter.node(), diff.node(), region_in.node(), region_out.node(), src.node(), | |||
| param, config); | |||
| } | |||
| SymbolVar RegionRestrictedConvolutionBackwardData::make( | |||
| SymbolVar filter, SymbolVar diff, SymbolVar region_in, SymbolVar region_out, | |||
| const Param& param, const OperatorNodeConfig& config) { | |||
| return make(filter, diff, region_in, region_out, {}, param, config); | |||
| } | |||
| void RegionRestrictedConvolutionBackwardData::init_output_static_infer_desc() { | |||
| init_output_static_infer_desc_for_bwd_data< | |||
| RegionRestrictedConvolutionBackwardData, | |||
| megdnn::RegionRestrictedConvolutionBackwardData>(this); | |||
| } | |||
| void RegionRestrictedConvolutionBackwardData::init_output_dtype() { | |||
| output(0)->dtype(input(0)->dtype()); | |||
| } | |||
| void RegionRestrictedConvolutionBackwardData::scn_do_execute() { | |||
| megdnn_opr()->exec( | |||
| input(0)->dev_tensor().as_megdnn(), // filter | |||
| input(1)->dev_tensor().as_megdnn(), // diff | |||
| input(2)->dev_tensor().as_megdnn(), // rin | |||
| input(3)->dev_tensor().as_megdnn(), // rout | |||
| output(0)->dev_tensor().as_megdnn(), | |||
| intl::get_megdnn_workspace_from_var(output().back())); | |||
| } | |||
| cg::OperatorNodeBase::NodeProp* RegionRestrictedConvolutionBackwardData:: | |||
| do_make_node_prop() const { | |||
| auto prop = Super::Super::do_make_node_prop(); | |||
| if (input().size() == 5) { | |||
| using D = NodeProp::DepType; | |||
| prop->reset_dep_type( | |||
| input(), | |||
| {D::DEV_VALUE, D::DEV_VALUE, D::DEV_VALUE, D::DEV_VALUE, D::SHAPE}); | |||
| } | |||
| return prop; | |||
| } | |||
| #if MGB_ENABLE_GRAD | |||
| MGB_IMPL_OPR_GRAD(RegionRestrictedConvolutionBackwardData) { | |||
| if (wrt_idx == 0) { // filter | |||
| return RegionRestrictedConvolutionBackwardFilter::make( | |||
| out_grad[0], opr.input(1), opr.input(2), opr.input(3), | |||
| opr.input(0), opr.param()) | |||
| .node(); | |||
| } | |||
| if (wrt_idx == 1) { // diff | |||
| return RegionRestrictedConvolution::make( | |||
| out_grad[0], opr.input(0), opr.input(2), opr.input(3), | |||
| opr.param()) | |||
| .node(); | |||
| } | |||
| return nullptr; | |||
| } | |||
| #endif | |||
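| // In the grad above, out_grad[0] has the shape of the data grad (i.e. of | |||
| // src), so it fills the src slot of BackwardFilter for wrt_idx == 0 and | |||
| // the src slot of the forward op for wrt_idx == 1. | |||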
| /* ========================== RegionRestrictedConvolutionBackwardFilter | |||
| * ========================== */ | |||
| IMPL_CONV(RegionRestrictedConvolutionBackwardFilter); | |||
| RegionRestrictedConvolutionBackwardFilter::RegionRestrictedConvolutionBackwardFilter( | |||
| VarNode* src, VarNode* diff, VarNode* region_in, VarNode* region_out, | |||
| VarNode* filter, const Param& param, const OperatorNodeConfig& config) | |||
| : Super({src->owner_graph(), | |||
| config, | |||
| "region_restricted_conv_bwd_filter", | |||
| {src, diff, region_in, region_out, filter}}, | |||
| 4, false) { | |||
| init_megdnn_opr(*this, param); | |||
| add_input({src, diff, region_in, region_out, filter}); | |||
| } | |||
| SymbolVar RegionRestrictedConvolutionBackwardFilter::make( | |||
| SymbolVar src, SymbolVar diff, SymbolVar region_in, SymbolVar region_out, | |||
| SymbolVar filter, const Param& param, const OperatorNodeConfig& config) { | |||
| return src.insert_single_output_opr<RegionRestrictedConvolutionBackwardFilter>( | |||
| src.node(), diff.node(), region_in.node(), region_out.node(), filter.node(), | |||
| param, config); | |||
| } | |||
| size_t RegionRestrictedConvolutionBackwardFilter::get_workspace_size_bytes( | |||
| const TensorShapeArray& input_shapes, | |||
| const TensorShapeArray& output_shapes) const { | |||
| return megdnn_opr()->get_workspace_in_bytes( | |||
| {input_shapes[0], input(0)->dtype(), input(0)->format()}, | |||
| {input_shapes[1], input(1)->dtype(), input(1)->format()}, | |||
| {input_shapes[2], input(2)->dtype(), input(2)->format()}, | |||
| {input_shapes[3], input(3)->dtype(), input(3)->format()}, | |||
| {output_shapes[0], output(0)->dtype(), output(0)->format()}); | |||
| } | |||
| void RegionRestrictedConvolutionBackwardFilter::scn_do_execute() { | |||
| megdnn_opr()->exec( | |||
| input(0)->dev_tensor().as_megdnn(), // src | |||
| input(1)->dev_tensor().as_megdnn(), // diff | |||
| input(2)->dev_tensor().as_megdnn(), // rin | |||
| input(3)->dev_tensor().as_megdnn(), // rout | |||
| output(0)->dev_tensor().as_megdnn(), | |||
| intl::get_megdnn_workspace_from_var(output().back())); | |||
| } | |||
| #if MGB_ENABLE_GRAD | |||
| MGB_IMPL_OPR_GRAD(RegionRestrictedConvolutionBackwardFilter) { | |||
| if (wrt_idx == 0) { | |||
| return RegionRestrictedConvolutionBackwardData::make( | |||
| out_grad[0] /*filter*/, opr.input(1) /*diff*/, | |||
| opr.input(2) /*rin*/, opr.input(3) /*rout*/, | |||
| opr.input(0) /*src*/, opr.param()) | |||
| .node(); | |||
| } | |||
| if (wrt_idx == 1) { | |||
| return RegionRestrictedConvolution::make( | |||
| opr.input(0) /*src*/, out_grad[0] /*filter*/, | |||
| opr.input(2) /*rin*/, opr.input(3) /*rout*/, opr.param()) | |||
| .node(); | |||
| } | |||
| return nullptr; | |||
| } | |||
| #endif | |||
| #undef IMPL_CONV | |||
| // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
| @@ -431,6 +431,7 @@ struct OprLoadDumpImpl<opr::Convolution3DBackwardFilter, 0> | |||
| MakeConvCallerEmpty<megdnn::Convolution3D>, | |||
| MakeConvCallerEmpty<megdnn::Convolution3D>, | |||
| megdnn::param::Convolution3D> {}; | |||
| template <> | |||
| struct OprLoadDumpImpl<opr::ConvBiasForward, 0> | |||
| : public ConvLoadDumpImpl< | |||
| @@ -194,6 +194,30 @@ struct OprLoadDumpImplV2<opr::DeformableConvBackwardFilter, 0> | |||
| MakeConvCaller5<megdnn::DeformableConvBackwardFilter>, | |||
| megdnn::Convolution> {}; | |||
| template <> | |||
| struct OprMaker<opr::RegionRestrictedConvolutionBackwardData, 0> { | |||
| using Opr = opr::RegionRestrictedConvolutionBackwardData; | |||
| using Param = Opr::Param; | |||
| static cg::OperatorNodeBase* make( | |||
| const Param& param, const cg::VarNodeArray& inputs, ComputingGraph& graph, | |||
| const OperatorNodeConfig& config) { | |||
| MGB_MARK_USED_VAR(graph); | |||
| if (inputs.size() == 4) { // deconv mode | |||
| return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3], param, config) | |||
| .node() | |||
| ->owner_opr(); | |||
| } else if (inputs.size() == 5) { // dgrad mode | |||
| return Opr::make( | |||
| inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], param, | |||
| config) | |||
| .node() | |||
| ->owner_opr(); | |||
| } else { | |||
| return nullptr; | |||
| } | |||
| } | |||
| }; | |||
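| // Any other input count is malformed for this op and is rejected by | |||
| // returning nullptr to the serialization framework. | |||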
| } // namespace serialization | |||
| namespace opr { | |||
| @@ -220,6 +244,10 @@ SERGE_OPR_V2_NO_CONVERTER(Convolution3D, 0); | |||
| SERGE_OPR_V2_NO_CONVERTER(Convolution3DBackwardData, 0); | |||
| SERGE_OPR_V2_NO_CONVERTER(Convolution3DBackwardFilter, 0); | |||
| MGB_SEREG_OPR(RegionRestrictedConvolutionBackwardData, 0); | |||
| MGB_SEREG_OPR(RegionRestrictedConvolution, 4); | |||
| MGB_SEREG_OPR(RegionRestrictedConvolutionBackwardFilter, 5); | |||
| SERGE_OPR_V2_NO_CONVERTER(LocalShareForward, 0); | |||
| SERGE_OPR_V2_NO_CONVERTER(LocalShareBackwardData, 0); | |||
| SERGE_OPR_V2_NO_CONVERTER(LocalShareBackwardFilter, 0); | |||
| @@ -18,6 +18,12 @@ protected: | |||
| static void init_output_static_infer_desc_for_bwd_data(cg::OperatorNodeBase* self); | |||
| }; | |||
| class RegionConvBackwardDataMixin : public cg::OperatorNodeMixinBase { | |||
| protected: | |||
| template <typename MGBOPR, typename DNNOPR> | |||
| static void init_output_static_infer_desc_for_bwd_data(cg::OperatorNodeBase* self); | |||
| }; | |||
| class WeightPreprocessExecutor : public cg::OperatorNodeMixinBase { | |||
| class PreprocessedFilterExecDep; | |||
| @@ -83,6 +89,80 @@ class ConvolutionTestingPeer; | |||
| } // namespace testing | |||
| /* ==================== RegionRestrictedConvolutionForward ==================== */ | |||
| MGB_DEFINE_OPR_CLASS_WITH_EXPORT( | |||
| RegionRestrictedConvolutionForward, | |||
| intl::MegDNNOprWrapperFwd<megdnn::RegionRestrictedConvolutionForward>) // { | |||
| size_t get_workspace_size_bytes( | |||
| const TensorShapeArray& input_shapes, | |||
| const TensorShapeArray& output_shapes) const override; | |||
| void init_output_dtype() override; | |||
| public: | |||
| MGE_WIN_DECLSPEC_FUC RegionRestrictedConvolutionForward( | |||
| VarNode* src, VarNode* filter, VarNode* region_in, VarNode* region_out, | |||
| const Param& param, const OperatorNodeConfig& config); | |||
| MGE_WIN_DECLSPEC_FUC static SymbolVar make( | |||
| SymbolVar src, SymbolVar filter, SymbolVar region_in, SymbolVar region_out, | |||
| const Param& param, const OperatorNodeConfig& config = {}); | |||
| }; | |||
| using RegionRestrictedConvolution = RegionRestrictedConvolutionForward; | |||
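| // Hedged usage sketch (assuming a graph with suitably shaped variables): | |||
| // | |||
| //     auto y = opr::RegionRestrictedConvolution::make( | |||
| //             src, filter, region_in, region_out, param); | |||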
| /* ==================== RegionRestrictedConvolutionBackwardData ==================== */ | |||
| MGB_DEFINE_OPR_CLASS_WITH_EXPORT( | |||
| RegionRestrictedConvolutionBackwardData, | |||
| cg::SingleCNOperatorNodeBaseT<mixin::MegDNNOprHolderImpl< | |||
| megdnn::RegionRestrictedConvolutionBackwardData>>, | |||
| public mixin::RegionConvBackwardDataMixin) // { | |||
| void scn_do_execute() override; | |||
| void init_output_static_infer_desc() override; | |||
| NodeProp* do_make_node_prop() const override; | |||
| void init_output_dtype() override; | |||
| public: | |||
| MGE_WIN_DECLSPEC_FUC RegionRestrictedConvolutionBackwardData( | |||
| VarNode* filter, VarNode* diff, VarNode* region_in, VarNode* region_out, | |||
| VarNode* src, const Param& param, const OperatorNodeConfig& config); | |||
| // grad mode | |||
| MGE_WIN_DECLSPEC_FUC static SymbolVar make( | |||
| SymbolVar filter, SymbolVar diff, SymbolVar region_in, SymbolVar region_out, | |||
| SymbolVar src, const Param& param, const OperatorNodeConfig& config = {}); | |||
| // sereg for deconv mode | |||
| MGE_WIN_DECLSPEC_FUC static SymbolVar make( | |||
| SymbolVar filter, SymbolVar diff, SymbolVar region_in, SymbolVar region_out, | |||
| const Param& param, const OperatorNodeConfig& config = {}); | |||
| // user interface for deconv | |||
| MGE_WIN_DECLSPEC_FUC static SymbolVar make_deconv( | |||
| SymbolVar data, SymbolVar filter, SymbolVar region_in, SymbolVar region_out, | |||
| const Param& param = {}, const OperatorNodeConfig& config = {}) { | |||
| return make(filter, data, region_in, region_out, param, config); | |||
| } | |||
| }; | |||
| /* ==================== RegionRestrictedConvolutionBackwardFilter ==================== */ | |||
| MGB_DEFINE_OPR_CLASS_WITH_EXPORT( | |||
| RegionRestrictedConvolutionBackwardFilter, | |||
| intl::MegDNNOprWrapperBwd<megdnn::RegionRestrictedConvolutionBackwardFilter>) // { | |||
| size_t get_workspace_size_bytes( | |||
| const TensorShapeArray& input_shapes, | |||
| const TensorShapeArray& output_shapes) const override; | |||
| void scn_do_execute() override; | |||
| public: | |||
| MGE_WIN_DECLSPEC_FUC RegionRestrictedConvolutionBackwardFilter( | |||
| VarNode* src, VarNode* diff, VarNode* region_in, VarNode* region_out, | |||
| VarNode* filter, const Param& param, const OperatorNodeConfig& config); | |||
| MGE_WIN_DECLSPEC_FUC static SymbolVar make( | |||
| SymbolVar src, SymbolVar diff, SymbolVar region_in, SymbolVar region_out, | |||
| SymbolVar filter, const Param& param, | |||
| const OperatorNodeConfig& config = {}); | |||
| }; | |||
| MGB_DEFINE_OPR_CLASS_WITH_EXPORT( | |||
| ConvolutionForward, intl::ConvolutionForwardBase, | |||
| public mixin::AlgoChooserHelper) // { | |||
| @@ -0,0 +1,196 @@ | |||
| #include "./legacy_checker.h" | |||
| #include "megbrain/comp_node_env.h" | |||
| #include "megbrain/gopt/inference.h" | |||
| #include "megbrain/opr/basic_arith.h" | |||
| #include "megbrain/opr/dnn/convolution.h" | |||
| #include "megbrain/opr/tensor_manip.h" | |||
| #include "megbrain/serialization/serializer.h" | |||
| #include "megbrain/test/autocheck.h" | |||
| #include "megbrain/test/helper.h" | |||
| #include "megbrain/test/megdnn_helper.h" | |||
| #include "megdnn/algorithm_cache.h" | |||
| #include "megdnn/dtype.h" | |||
| #include "megdnn/oprs/base.h" | |||
| #include <gmock/gmock.h> | |||
| #include <cmath> | |||
| #include <memory> | |||
| #include <random> | |||
| using namespace mgb; | |||
| TEST(TestOprDNN, REGIONCONV_FWD_CPU_WRAPPER) { | |||
| using Checker = AutoOprChecker<4, 1>; | |||
| megdnn::RegionRestrictedConvolution::Param param; | |||
| param.sparse = opr::RegionRestrictedConvolution::Param::Sparse::DENSE; | |||
| auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray { | |||
| return {opr::RegionRestrictedConvolutionForward::make( | |||
| inputs[0], inputs[1], inputs[2], inputs[3], param)}; | |||
| }; | |||
| Checker::RunOptions option; | |||
| option.numdiff_eps = 0.1; | |||
| option.numdiff_max_err = 1e-2; | |||
| auto mask_gen = [&](HostTensorND& src) { | |||
| HostTensorGenerator<dtype::Int32, RandomDistribution::CONSTANT> gen(1); | |||
| src = *gen(src.shape(), src.comp_node()); | |||
| }; | |||
| auto float_gen = [&](HostTensorND& src) { | |||
| HostTensorGenerator<dtype::Float32, RandomDistribution::GAUSSIAN> gen; | |||
| src = *gen(src.shape(), src.comp_node()); | |||
| }; | |||
| auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) { | |||
| auto opr = | |||
| megdnn_naive_handle() | |||
| ->create_operator<megdnn::RegionRestrictedConvolutionForward>(); | |||
| opr->param() = param; | |||
| TensorLayout dest_layout; | |||
| opr->deduce_layout( | |||
| inp[0]->layout(), inp[1]->layout(), inp[2]->layout(), inp[3]->layout(), | |||
| dest_layout); | |||
| std::vector<dt_byte> workspace(opr->get_workspace_in_bytes( | |||
| inp[0]->layout(), inp[1]->layout(), inp[2]->layout(), inp[3]->layout(), | |||
| dest_layout)); | |||
| dest[0].dtype(inp[0]->dtype()) | |||
| .comp_node(inp[0]->comp_node()) | |||
| .resize(dest_layout); | |||
| opr->exec( | |||
| inp[0]->as_megdnn(), inp[1]->as_megdnn(), inp[2]->as_megdnn(), | |||
| inp[3]->as_megdnn(), dest[0].as_megdnn(), | |||
| {workspace.data(), workspace.size()}); | |||
| }; | |||
| Checker(make_graph, fwd, CompNode::load("cpu0")) | |||
| .set_input_dtype(0, dtype::Float32()) | |||
| .set_input_dtype(1, dtype::Float32()) | |||
| .set_input_dtype(2, dtype::Int32()) | |||
| .set_input_dtype(3, dtype::Int32()) | |||
| .set_input_generator(0, float_gen) | |||
| .set_input_generator(1, float_gen) | |||
| .set_input_generator(2, mask_gen) | |||
| .set_input_generator(3, mask_gen) | |||
| .set_input_allow_grad(2, false) | |||
| .set_input_allow_grad(3, false) | |||
| // {n,ic,ih,iw}, {oc,ic,fh,fw}, {n,ih,iw}, {n,oh,ow} | |||
| .run({TensorShape{1, 2, 2, 2}, TensorShape{1, 2, 2, 2}, | |||
| TensorShape{1, 2, 2}, TensorShape{1, 1, 1}}, | |||
| option) | |||
| .run({TensorShape{1, 2, 3, 3}, TensorShape{1, 2, 3, 3}, | |||
| TensorShape{1, 3, 3}, TensorShape{1, 1, 1}}, | |||
| option) | |||
| .run({TensorShape{1, 1, 4, 4}, TensorShape{1, 1, 2, 2}, | |||
| TensorShape{1, 4, 4}, TensorShape{1, 3, 3}}, | |||
| option) | |||
| .run({TensorShape{2, 2, 8, 8}, TensorShape{4, 2, 2, 2}, | |||
| TensorShape{2, 8, 8}, TensorShape{2, 7, 7}}, | |||
| option) | |||
| .run({TensorShape{4, 4, 8, 8}, TensorShape{4, 4, 2, 2}, | |||
| TensorShape{4, 8, 8}, TensorShape{4, 7, 7}}, | |||
| option); | |||
| } | |||
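| // The loose numdiff settings above (eps = 0.1, max_err = 1e-2) are | |||
| // presumably chosen because the float32 forward is differentiated | |||
| // numerically; the constant all-ones region masks make every position | |||
| // match, so the checked op reduces to an ordinary dense convolution. | |||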
| #if MGB_CUDA | |||
| TEST(TestOprDNN, REGIONCONV_FWD_GPU_WRAPPER) { | |||
| using Checker = AutoOprChecker<4, 1>; | |||
| megdnn::RegionRestrictedConvolution::Param param; | |||
| param.sparse = opr::RegionRestrictedConvolution::Param::Sparse::GROUP; | |||
| auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray { | |||
| return {opr::RegionRestrictedConvolutionForward::make( | |||
| inputs[0], inputs[1], inputs[2], inputs[3], param)}; | |||
| }; | |||
| Checker::RunOptions option; | |||
| option.numdiff_eps = 0.1; | |||
| option.numdiff_max_err = 1e-2; | |||
| auto mask_gen = [&](HostTensorND& src) { | |||
| HostTensorGenerator<dtype::Int32, RandomDistribution::CONSTANT> gen(1); | |||
| src = *gen(src.shape(), src.comp_node()); | |||
| }; | |||
| auto uint8_mask_gen = [&](HostTensorND& src) { | |||
| HostTensorGenerator<dtype::Uint8, RandomDistribution::CONSTANT> gen(1); | |||
| src = *gen(src.shape(), src.comp_node()); | |||
| }; | |||
| auto float_gen = [&](HostTensorND& src) { | |||
| HostTensorGenerator<dtype::Float32, RandomDistribution::GAUSSIAN> gen; | |||
| src = *gen(src.shape(), src.comp_node()); | |||
| }; | |||
| auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) { | |||
| auto opr = | |||
| megdnn_naive_handle() | |||
| ->create_operator<megdnn::RegionRestrictedConvolutionForward>(); | |||
| opr->param() = param; | |||
| TensorLayout dest_layout; | |||
| opr->deduce_layout( | |||
| inp[0]->layout(), inp[1]->layout(), inp[2]->layout(), inp[3]->layout(), | |||
| dest_layout); | |||
| std::vector<dt_byte> workspace(opr->get_workspace_in_bytes( | |||
| inp[0]->layout(), inp[1]->layout(), inp[2]->layout(), inp[3]->layout(), | |||
| dest_layout)); | |||
| dest[0].dtype(inp[0]->dtype()) | |||
| .comp_node(inp[0]->comp_node()) | |||
| .resize(dest_layout); | |||
| opr->exec( | |||
| inp[0]->as_megdnn(), inp[1]->as_megdnn(), inp[2]->as_megdnn(), | |||
| inp[3]->as_megdnn(), dest[0].as_megdnn(), | |||
| {workspace.data(), workspace.size()}); | |||
| }; | |||
| Checker(make_graph, fwd, CompNode::load("gpu0")) | |||
| .set_input_dtype(0, dtype::Float32()) | |||
| .set_input_dtype(1, dtype::Float32()) | |||
| .set_input_dtype(2, dtype::Int32()) | |||
| .set_input_dtype(3, dtype::Int32()) | |||
| .set_input_generator(0, float_gen) | |||
| .set_input_generator(1, float_gen) | |||
| .set_input_generator(2, mask_gen) | |||
| .set_input_generator(3, mask_gen) | |||
| .set_input_allow_grad(2, false) | |||
| .set_input_allow_grad(3, false) | |||
| // {n,ic,ih,iw}, {g,ocpg,icpg,fh,fw}, {n,ih,iw}, {n,oh,ow} | |||
| .run({TensorShape{1, 2, 2, 2}, TensorShape{2, 1, 1, 2, 2}, | |||
| TensorShape{1, 2, 2}, TensorShape{1, 1, 1}}, | |||
| option) | |||
| .run({TensorShape{1, 2, 3, 3}, TensorShape{2, 1, 1, 3, 3}, | |||
| TensorShape{1, 3, 3}, TensorShape{1, 1, 1}}, | |||
| option) | |||
| .run({TensorShape{1, 4, 4, 4}, TensorShape{4, 1, 1, 2, 2}, | |||
| TensorShape{1, 4, 4}, TensorShape{1, 3, 3}}, | |||
| option) | |||
| .run({TensorShape{2, 4, 8, 8}, TensorShape{4, 1, 1, 2, 2}, | |||
| TensorShape{2, 8, 8}, TensorShape{2, 7, 7}}, | |||
| option); | |||
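| // Second pass: the same group-sparse op with Uint8 region masks, covering | |||
| // the other region dtype exercised here in addition to Int32. | |||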
| Checker(make_graph, fwd, CompNode::load("gpu0")) | |||
| .set_input_dtype(0, dtype::Float32()) | |||
| .set_input_dtype(1, dtype::Float32()) | |||
| .set_input_dtype(2, dtype::Uint8()) | |||
| .set_input_dtype(3, dtype::Uint8()) | |||
| .set_input_generator(0, float_gen) | |||
| .set_input_generator(1, float_gen) | |||
| .set_input_generator(2, uint8_mask_gen) | |||
| .set_input_generator(3, uint8_mask_gen) | |||
| .set_input_allow_grad(2, false) | |||
| .set_input_allow_grad(3, false) | |||
| // {n,ic,ih,iw}, {g,ocpg,icpg,fh,fw}, {n,ih,iw}, {n,oh,ow} | |||
| .run({TensorShape{1, 2, 4, 4}, TensorShape{2, 1, 1, 1, 1}, | |||
| TensorShape{1, 4, 4}, TensorShape{1, 4, 4}}, | |||
| option) | |||
| .run({TensorShape{1, 2, 8, 8}, TensorShape{2, 1, 1, 1, 1}, | |||
| TensorShape{1, 8, 8}, TensorShape{1, 8, 8}}, | |||
| option) | |||
| .run({TensorShape{1, 4, 8, 8}, TensorShape{4, 1, 1, 5, 5}, | |||
| TensorShape{1, 8, 8}, TensorShape{1, 4, 4}}, | |||
| option) | |||
| .run({TensorShape{2, 4, 8, 8}, TensorShape{4, 1, 1, 1, 1}, | |||
| TensorShape{2, 8, 8}, TensorShape{2, 8, 8}}, | |||
| option); | |||
| } | |||
| #endif | |||