feat(mge/module): add python wrapper for unfold

GitOrigin-RevId: 562103186f
5 years ago · 1525a02530
--- a/dnn/scripts/opr_param_defs.py
+++ b/dnn/scripts/opr_param_defs.py
@@ -220,7 +220,7 @@ pdef('Axis').add_fields('int32', 'axis', 0)

 (pdef('Images2Neibs').
 add_fields('uint32', 'pad_h', 0, 'pad_w', 0, 'stride_h', 1, 'stride_w', 1,
            'window_h', 3, 'window_w', 3))
            'dilate_h', 1, 'dilate_w', 1, 'window_h', 3, 'window_w', 3))

 (pdef('Pooling', version=0, is_legacy=True).
 add_enum(
--- a/dnn/src/common/images2neibs.cpp
+++ b/dnn/src/common/images2neibs.cpp
@@ -23,6 +23,8 @@ void Images2NeibsBase::deduce_layout_fwd(const TensorLayout &src,
               "pad_w=" + std::to_string(param().pad_w) + ", " +
               "stride_h=" + std::to_string(param().stride_h) + ", " +
               "stride_w=" + std::to_string(param().stride_w) + ", " +
               "dilate_h=" + std::to_string(param().dilate_h) + ", " +
               "dilate_w=" + std::to_string(param().dilate_w) + ", " +
               "window_h=" + std::to_string(param().window_h) + ", " +
               "window_w=" + std::to_string(param().window_w);
    };
@@ -34,11 +36,13 @@ void Images2NeibsBase::deduce_layout_fwd(const TensorLayout &src,
    size_t pw = this->param().pad_w;
    size_t sh = this->param().stride_h;
    size_t sw = this->param().stride_w;
    size_t dh = this->param().dilate_h;
    size_t dw = this->param().dilate_w;
    size_t wh = this->param().window_h;
    size_t ww = this->param().window_w;
    size_t oh, ow;

    infer_conv_shape2d(ih, iw, wh, ww, sh, sw, ph, pw, oh, ow);
    infer_conv_shape2d(ih, iw, wh+(wh-1)*(dh-1), ww+(ww-1)*(dw-1), sh, sw, ph, pw, oh, ow);
    dst = TensorLayout(TensorShape({n, ic, oh, ow, wh, ww}), src.dtype);
 }

--- a/dnn/src/cuda/images2neibs/kernel.cu
+++ b/dnn/src/cuda/images2neibs/kernel.cu
@@ -24,7 +24,7 @@ namespace images2neibs {
 template <typename T>
 __global__ void forward_kernel(const T *src, T *dst,
        int N, int C, int IH, int IW, int OH, int OW,
        int ph, int pw, int sh, int sw, int WH, int WW)
        int ph, int pw, int sh, int sw, int dh, int dw, int WH, int WW)
 {
    int NC = N * C;
    int WP = WH*WW;
@@ -37,8 +37,8 @@ __global__ void forward_kernel(const T *src, T *dst,
            if (op < OH * OW) {
                int oh = op / OW;
                int ow = op % OW;
                int ih = -ph + sh * oh + wh;
                int iw = -pw + sw * ow + ww;
                int ih = -ph + sh * oh + wh* dh;
                int iw = -pw + sw * ow + ww* dw;
                int dst_pos = nc * OH * OW * WH * WW + op * WH * WW + wp;
                int src_pos = nc * IH * IW + ih * IW + iw;
                dst[dst_pos] = (ih >= 0 && ih < IH && iw >= 0 && iw < IW)
@@ -52,7 +52,7 @@ __global__ void forward_kernel(const T *src, T *dst,

 template <typename T>
 void forward(const T* src, T* dst, int N, int C, int IH, int IW, int OH, int OW,
             int ph, int pw, int sh, int sw, int wh, int ww,
             int ph, int pw, int sh, int sw, int dh, int dw, int wh, int ww,
             cudaStream_t stream) {
    int spatial_size = OH * OW;
    int kernel_size = wh * ww;
@@ -63,7 +63,7 @@ void forward(const T* src, T* dst, int N, int C, int IH, int IW, int OH, int OW,
    int by = N * C;

    forward_kernel<<<dim3(bx, std::min(grid_y_max, by)), dim3(tx, ty), 0,
                     stream>>>(src, dst, N, C, IH, IW, OH, OW, ph, pw, sh, sw,
                     stream>>>(src, dst, N, C, IH, IW, OH, OW, ph, pw, sh, sw, dh, dw,
                               wh, ww);
    after_kernel_launch();
 }
@@ -73,7 +73,7 @@ void forward(const T* src, T* dst, int N, int C, int IH, int IW, int OH, int OW,
 template <typename T>
 __global__ void backward_kernel(const T *diff, T *grad,
        int N, int C, int IH, int IW, int OH, int OW,
        int ph, int pw, int sh, int sw, int WH, int WW)
        int ph, int pw, int sh, int sw, int dh, int dw, int WH, int WW)
 {
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id < N*C*IH*IW) {
@@ -82,17 +82,20 @@ __global__ void backward_kernel(const T *diff, T *grad,
        int iw = id % (IH*IW) % IW;
        grad[nc*IH*IW + ih*IW + iw] = 0.0f;
        int oh_max = min((ih+ph) / sh, OH-1);
        int oh_min = max((ih+ph-(WH-1)+sh-1) / sh, 0);
        int oh_min = max((ih+ph-(WH-1)*dh+sh-1) / sh, 0);
        int ow_max = min((iw+pw) / sw, OW-1);
        int ow_min = max((iw+pw-(WW-1)+sw-1) / sw, 0);
        int ow_min = max((iw+pw-(WW-1)*dw+sw-1) / sw, 0);
        for (int oh = oh_min; oh <= oh_max; ++oh)
        for (int ow = ow_min; ow <= ow_max; ++ow)
        {
            int wh = ih+ph - sh*oh;
            int ww = iw+pw - sw*ow;
            grad[nc*IH*IW + ih*IW + iw] +=
                diff[nc*OH*OW*WH*WW + oh*OW*WH*WW + ow*WH*WW +
                        wh*WW + ww];
            if ((ih+ph - sh*oh)%dh==0 && (iw+pw - sw*ow)%dw==0){
                int wh = ih+ph - sh*oh - (ih+ph - sh*oh)/dh * (dh-1);
                int ww = iw+pw - sw*ow - (iw+pw - sw*ow)/dw * (dw-1);
                grad[nc*IH*IW + ih*IW + iw] +=
                    diff[nc*OH*OW*WH*WW + oh*OW*WH*WW + ow*WH*WW +
                            wh*WW + ww];

            }
        }
    }
 }
@@ -100,23 +103,23 @@ __global__ void backward_kernel(const T *diff, T *grad,
 template <typename T>
 void backward(const T *diff, T *grad,
        int N, int C, int IH, int IW, int OH, int OW,
        int ph, int pw, int sh, int sw, int wh, int ww,
        int ph, int pw, int sh, int sw, int dh, int dw, int wh, int ww,
        cudaStream_t stream)
 {
    int threads = NR_THREADS;
    int blocks = DIVUP(N*C*IH*IW, threads);
    backward_kernel<<<blocks, threads, 0, stream>>>(diff, grad,
            N, C, IH, IW, OH, OW,
            ph, pw, sh, sw, wh, ww);
            ph, pw, sh, sw, dh, dw, wh, ww);
    after_kernel_launch();
 }

 #define INST(T) \
    template void forward<T>(const T *, T *, int, int, int, int, int, int, \
            int, int, int, int, int, int, \
            int, int, int, int, int, int, int, int, \
            cudaStream_t); \
    template void backward<T>(const T *, T *, int, int, int, int, int, int, \
            int, int, int, int, int, int, \
            int, int, int, int, int, int, int, int, \
            cudaStream_t);
 #define cb(DType) \
    INST(DTypeTrait<DType>::ctype)
--- a/dnn/src/cuda/images2neibs/kernel.cuh
+++ b/dnn/src/cuda/images2neibs/kernel.cuh
@@ -18,13 +18,13 @@ namespace images2neibs {
 template <typename T>
 void forward(const T *src, T *dst,
        int N, int C, int IH, int IW, int OH, int OW,
        int ph, int pw, int sh, int sw, int wh, int ww,
        int ph, int pw, int sh, int sw, int dh, int dw, int wh, int ww,
        cudaStream_t stream);

 template <typename T>
 void backward(const T *diff, T *grad,
        int N, int C, int IH, int IW, int OH, int OW,
        int ph, int pw, int sh, int sw, int wh, int ww,
        int ph, int pw, int sh, int sw, int dh, int dw, int wh, int ww,
        cudaStream_t stream);
       
 } // namespace images2neibs
--- a/dnn/src/cuda/images2neibs/opr_impl.cpp
+++ b/dnn/src/cuda/images2neibs/opr_impl.cpp
@@ -27,13 +27,14 @@ void Images2NeibsForwardImpl::exec(_megdnn_tensor_in src,
    int OH = dst.layout[2], OW = dst.layout[3];
    int ph = param().pad_h, pw = param().pad_w;
    int sh = param().stride_h, sw = param().stride_w;
    int dh = param().dilate_h, dw = param().dilate_w;
    int wh = param().window_h, ww = param().window_w;
 #define cb(DType) \
    if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) { \
        using T = DTypeTrait<DType>::ctype; \
        images2neibs::forward(src.ptr<T>(), dst.ptr<T>(), \
                N, C, IH, IW, OH, OW, \
                ph, pw, sh, sw, wh, ww, \
                ph, pw, sh, sw, dh, dw, wh, ww, \
                stream); \
        return; \
    }
@@ -53,13 +54,14 @@ void Images2NeibsBackwardImpl::exec(_megdnn_tensor_in diff,
    int OH = diff.layout[2], OW = diff.layout[3];
    int ph = param().pad_h, pw = param().pad_w;
    int sh = param().stride_h, sw = param().stride_w;
    int dh = param().dilate_h, dw = param().dilate_w;
    int wh = param().window_h, ww = param().window_w;
 #define cb(DType) \
    if (diff.layout.dtype == DType()) { \
        using T = DTypeTrait<DType>::ctype; \
        images2neibs::backward(diff.ptr<T>(), grad.ptr<T>(), \
                N, C, IH, IW, OH, OW, \
                ph, pw, sh, sw, wh, ww, \
                ph, pw, sh, sw, dh, dw, wh, ww, \
                stream); \
        return; \
    }
--- a/dnn/src/naive/images2neibs/opr_impl.cpp
+++ b/dnn/src/naive/images2neibs/opr_impl.cpp
@@ -33,20 +33,25 @@ void Images2NeibsForwardImpl::exec_internal(_megdnn_tensor_in src,
    int pad_w = static_cast<int>(param().pad_w);
    int stride_h = static_cast<int>(param().stride_h);
    int stride_w = static_cast<int>(param().stride_w);
    int dilate_h = static_cast<int>(param().dilate_h);
    int dilate_w = static_cast<int>(param().dilate_w);
    int equ_window_h = dilate_h * (window_h-1) + 1;
    int equ_window_w = dilate_w * (window_w-1) + 1;
    for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
    {
        int ih = -pad_h;
        for (; ih+window_h <= IH+pad_h; ih += stride_h) {
        for (; ih+equ_window_h <= IH+pad_h; ih += stride_h) {
            int iw = -pad_w;
            for (; iw+window_w <= IW+pad_w; iw += stride_w) {
            for (; iw+equ_window_w <= IW+pad_w; iw += stride_w) {
                for (int kh = 0; kh < window_h; ++kh)
                for (int kw = 0; kw < window_w; ++kw)
                {
                    int ih2 = ih+dilate_h*kh, iw2 = iw+dilate_w*kw;
                    dptr[idx*window_h*window_w + kh*window_w + kw] =
                        (ih+kh) >= 0 && (ih+kh) < IH &&
                        (iw+kw) >= 0 && (iw+kw) < IW ?
                        sptr[n*C*IH*IW + c*IH*IW + (ih+kh)*IW + (iw+kw)] : 0.0f;
                        ih2 >= 0 && ih2 < IH &&
                        iw2 >= 0 && iw2 < IW ?
                        sptr[n*C*IH*IW + c*IH*IW + ih2*IW + iw2] : 0.0f;
                }
                ++idx;
            }
@@ -86,18 +91,22 @@ void Images2NeibsBackwardImpl::exec_internal(_megdnn_tensor_in diff,
    int pad_w = static_cast<int>(param().pad_w);
    int stride_h = static_cast<int>(param().stride_h);
    int stride_w = static_cast<int>(param().stride_w);
    int dilate_h = static_cast<int>(param().dilate_h);
    int dilate_w = static_cast<int>(param().dilate_w);
    int equ_window_h = dilate_h * (window_h-1) + 1;
    int equ_window_w = dilate_w * (window_w-1) + 1;
    memset(sptr, 0, sizeof(T) * N*C*IH*IW);
    for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
    {
        int ih = -pad_h;
        for (; ih+window_h <= IH+pad_h; ih += stride_h) {
        for (; ih+equ_window_h <= IH+pad_h; ih += stride_h) {
            int iw = -pad_w;
            for (; iw+window_w <= IW+pad_w; iw += stride_w) {
            for (; iw+equ_window_w <= IW+pad_w; iw += stride_w) {
                for (int kh = 0; kh < window_h; ++kh)
                for (int kw = 0; kw < window_w; ++kw)
                {
                    int ih2 = ih+kh, iw2 = iw+kw;
                    int ih2 = ih+dilate_h*kh, iw2 = iw+dilate_w*kw;
                    if (ih2 >= 0 && ih2 < IH && iw2 >= 0 && iw2 < IW) {
                        sptr[n*C*IH*IW + c*IH*IW + ih2*IW + iw2] +=
                            dptr[idx*window_h*window_w + kh*window_w + kw];
--- a/dnn/test/common/images2neibs.h
+++ b/dnn/test/common/images2neibs.h
@@ -31,17 +31,19 @@ inline std::vector<TestArg> get_args() {
    for (uint32_t pw : {0, 1})
    for (uint32_t sh : {1, 2})
    for (uint32_t sw : {1, 2})
    for (uint32_t dh : {1, 2, 3})
    for (uint32_t dw : {1, 2, 3})
    for (uint32_t wh : {3, 4})
    for (uint32_t ww : {3, 4}) {
        args.emplace_back(param::Images2Neibs{ph, pw, sh, sw, wh, ww},
                          TensorShape{2, 3, 5, 6});
        args.emplace_back(param::Images2Neibs{ph, pw, sh, sw, dh, dw, wh, ww},
                          TensorShape{2, 3, 19, 20});
    }
    // clang-format on
    // large window case
    args.emplace_back(param::Images2Neibs{0, 0, 1, 1, 32, 64},
    args.emplace_back(param::Images2Neibs{0, 0, 1, 1, 1, 1, 32, 64},
                      TensorShape{2, 3, 96, 128});
    // large size
    args.emplace_back(param::Images2Neibs{0, 0, 1, 1, 1, 1},
    args.emplace_back(param::Images2Neibs{0, 0, 1, 1, 1, 1, 1, 1},
                      TensorShape{128, 128, 28, 24});

    return args;
@@ -54,17 +56,19 @@ inline std::vector<TestArg> get_benchmark_args() {
    for (uint32_t pw : {0, 1})
    for (uint32_t sh : {1, 2})
    for (uint32_t sw : {1, 2})
    for (uint32_t dh : {1, 2})
    for (uint32_t dw : {1, 2})
    for (uint32_t wh : {3, 4})
    for (uint32_t ww : {3, 4})
    for (uint32_t b : {1, 64})
    for (uint32_t c : {64, 128})
    for (uint32_t hw : {64, 128}) {
        args.emplace_back(param::Images2Neibs{ph, pw, sh, sw, wh, ww},
        args.emplace_back(param::Images2Neibs{ph, pw, sh, sw, dh, dw, wh, ww},
                          TensorShape{b, c, hw, hw});
    }
    // clang-format on
    // large size
    args.emplace_back(param::Images2Neibs{0, 0, 1, 1, 1, 1},
    args.emplace_back(param::Images2Neibs{0, 0, 1, 1, 1, 1, 1, 1},
                      TensorShape{1024, 128, 28, 24});

    return args;
--- a/dnn/test/naive/images2neibs.cpp
+++ b/dnn/test/naive/images2neibs.cpp
@@ -0,0 +1,59 @@
 /**
 * \file dnn/test/naive/images2neibs.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
 #include "test/naive/fixture.h"

 #include "megdnn/oprs/nn.h"
 #include "test/common/checker.h"

 using namespace megdnn;
 using namespace test;

 TEST_F(NAIVE, IMAGES2NEIBS_FORWARD) {
    Checker<Images2Neibs> checker(handle(), /* check_dispatch */false);
    
    Images2Neibs::Param param(0,0,1,1,1,1,2,2);
    checker.set_param(param).exect(
            Testcase{TensorValue({1, 1, 3, 3}, dtype::Uint8(),
                               {0,1,2,
                               3,4,5,
                               6,7,8}), {}}, 
            Testcase{{},
            TensorValue({1, 1, 2, 2, 2, 2}, dtype::Uint8(), 
                        {0,1,3,4,
                        1,2,4,5,
                        3,4,6,7,
                        4,5,7,8})});
    
    param.pad_h = 1;
    param.pad_w = 1;
    param.stride_h = 2;
    param.stride_w = 2;
    param.dilate_h = 2;
    param.dilate_w = 2;
    param.window_h = 3;
    param.window_w = 3;
    checker.set_param(param).exect(
          Testcase{TensorValue({1, 1, 6, 7}, dtype::Uint8(),
                               {0,1,2,3,4,5,6,
                               7,8,9,10,11,12,13,
                               14,15,16,17,18,19,20,
                               21,22,23,24,25,26,27,
                               28,29,30,31,32,33,34,
                               35,36,37,38,39,40,41}), {}}, 
          Testcase{{},
          TensorValue({1, 1, 2, 3, 3, 3}, dtype::Uint8(),
                        {0,0,0,0,8,10,0,22,24,
                        0,0,0,8,10,12,22,24,26,
                        0,0,0,10,12,0,24,26,0,
                        0,8,10,0,22,24,0,36,38,
                        8,10,12,22,24,26,36,38,40,
                        10,12,0,24,26,0,38,40,0})});
 }
--- a/imperative/python/megengine/functional/nn.py
+++ b/imperative/python/megengine/functional/nn.py
@@ -70,6 +70,7 @@ __all__ = [
    "remap",
    "resize",
    "sigmoid",
    "sliding_window",
    "softmax",
    "softplus",
    "sync_batch_norm",
@@ -1353,6 +1354,44 @@ def indexing_one_hot(
    return result


 def sliding_window(
    inp: Tensor,
    kernel_size: Union[int, Tuple[int, int]],
    padding: Union[int, Tuple[int, int]] = 0,
    stride: Union[int, Tuple[int, int]] = 1,
    dilation: Union[int, Tuple[int, int]] = 1,
 ) -> Tensor:
    """
    Extracts sliding local blocks from a batched input tensor.

    Refer to :class:`~.SlidingWindow` for more information.

    :param inp: input tensor.
    :param kernel_size: size of the window.
    :param padding: implicit zero padding added on both sides of input. Default: 0
    :param stride: stride of the window. Default: 1
    :param dilation: dilation of the window. Default: 1
    :return: output tensor.
    """
    padding_h, padding_w = _pair(padding)
    stride_h, stride_w = _pair_nonzero(stride)
    dilation_h, dilation_w = _pair_nonzero(dilation)
    window_h, window_w = _pair_nonzero(kernel_size)

    op = builtin.Images2Neibs(
        pad_h=padding_h,
        pad_w=padding_w,
        stride_h=stride_h,
        stride_w=stride_w,
        dilate_h=dilation_h,
        dilate_w=dilation_w,
        window_h=window_h,
        window_w=window_w,
    )
    (output,) = apply(op, inp)
    return output


 interpolate = deprecated_func("1.3", "megengine.functional.vision", "interpolate", True)
 roi_pooling = deprecated_func("1.3", "megengine.functional.vision", "roi_pooling", True)
 roi_align = deprecated_func("1.3", "megengine.functional.vision", "roi_align", True)
--- a/imperative/python/megengine/module/init.py
+++ b/imperative/python/megengine/module/init.py
@@ -34,3 +34,4 @@ from .normalization import GroupNorm, InstanceNorm, LayerNorm
 from .pooling import AvgPool2d, MaxPool2d
 from .quant_dequant import DequantStub, QuantStub
 from .sequential import Sequential
 from .sliding_window import SlidingWindow
--- a/imperative/python/megengine/module/sliding_window.py
+++ b/imperative/python/megengine/module/sliding_window.py
@@ -0,0 +1,88 @@
 # -*- coding: utf-8 -*-
 # MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 #
 # Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 from typing import Tuple, Union

 from ..functional import sliding_window
 from .module import Module


 class SlidingWindow(Module):
    r"""
    Apply a sliding window to input tensor and copy content in the window to
    corresponding output location. Assume input shape is :math:`(N, C, IH, IW)`,
    then output shape would be :math:`(N, C, OH, OW, window_h, window_w)` where
    :math:`(OH, OW)` would be computed from padding, stride, window and
    :math:`(IH, IW)`, as in convolution. For each output location, we have;

    .. math::

        out_{n, c, oh, ow, wh, ww} &= src_{n, c, ih+wh, iw+ww} \\
        \text{where } & ih=-pad_h+oh \times stride_h + (wh-1) \times (dilation_h-1) \\
                       & iw=-pad_w+ow \times stride_w + (ww-1) \times (dilation_w-1)


    :param kernel_size: the size of the window to take a max over.
    :param padding: implicit zero padding to be added on both sides. Default: 0
    :param stride: the stride of the window. Default: 1
    :param dilation: the dilation of the window. Default: 1

    Example:

    .. testcode::

        from megengine import tensor
        import megengine.module as M
        import numpy as np

        inp = tensor(np.arange(30).reshape(1,1,5,6))
        op = M.SlidingWindow(kernel_size=3, padding=1, stride=2, dilation=2)
        out = op(inp)
        print(out.numpy())

    Outputs:

    .. testoutput::

        [[[[[[ 0  0  0]
             [ 0  7  9]
             [ 0 19 21]]

            [[ 0  0  0]
             [ 7  9 11]
             [19 21 23]]]


           [[[ 0  7  9]
             [ 0 19 21]
             [ 0  0  0]]

            [[ 7  9 11]
             [19 21 23]
             [ 0  0  0]]]]]]

    """

    def __init__(
        self,
        kernel_size: Union[int, Tuple[int, int]],
        padding: Union[int, Tuple[int, int]] = 0,
        stride: Union[int, Tuple[int, int]] = 1,
        dilation: Union[int, Tuple[int, int]] = 1,
        **kwargs
    ):
        super(SlidingWindow, self).__init__(**kwargs)
        self.kernel_size = kernel_size
        self.padding = padding
        self.stride = stride
        self.dilation = dilation

    def forward(self, inp):
        return sliding_window(
            inp, self.kernel_size, self.padding, self.stride, self.dilation
        )
--- a/imperative/python/test/unit/functional/test_functional.py
+++ b/imperative/python/test/unit/functional/test_functional.py
@@ -927,3 +927,28 @@ def test_neg_axis():
    y = F.argmin(x, axis=(-1, -2))
    yy = F.argmin(x, axis=(0, 1))
    np.testing.assert_equal(y.numpy(), yy.numpy())


 def test_sliding_window():
    N, C, H, W = 2, 3, 7, 8
    inp = np.random.normal(size=(N, C, H, W))
    ph, pw = 1, 2
    sh, sw = 2, 1
    wh, ww = 3, 2
    dh, dw = 1, 3
    s = lambda i, p, s, d, w: (i + p * 2 - (w - 1) * d - 1) // s + 1
    inp_pad = np.zeros((N, C, H + ph * 2, W + pw * 2))
    inp_pad[:, :, ph : H + ph, pw : W + pw] = inp
    gt_out = np.empty(
        (N, C, s(H, ph, sh, dh, wh), s(W, pw, sw, dw, ww), wh, ww), dtype=np.float32
    )
    for n, c, oh, ow in itertools.product(*map(range, gt_out.shape[:4])):
        ih, iw = oh * sh, ow * sw
        gt_out[n, c, oh, ow, :] = inp_pad[
            n, c, ih : ih + (wh - 1) * dh + 1 : dh, iw : iw + (ww - 1) * dw + 1 : dw
        ]

    out = F.sliding_window(
        tensor(inp), (wh, ww), padding=(ph, pw), stride=(sh, sw), dilation=(dh, dw)
    )
    np.testing.assert_equal(gt_out, out.numpy())
--- a/imperative/src/impl/ops/specializations.cpp
+++ b/imperative/src/impl/ops/specializations.cpp
@@ -32,6 +32,7 @@
 #include "megbrain/opr/tensor_gen.h"
 #include "megbrain/opr/tensor_manip.h"
 #include "megbrain/opr/utility.h"
 #include "megbrain/opr/dnn/images2neibs.h"

 #include "../op_trait.h"

@@ -652,4 +653,17 @@ OP_TRAIT_REG(SVD, SVD)
    .fallback();
 }} // svd

 namespace { namespace images2neibs {
 auto apply_on_var_node(
        const OpDef& def,
        const VarNodeArray& inputs) {
    auto&& op = static_cast<const Images2Neibs&>(def);
    OperatorNodeConfig config{op.make_name()};
    return opr::Images2Neibs::make(inputs[0], op.param(), config);
 }
 OP_TRAIT_REG(Images2Neibs, Images2Neibs)
    .apply_on_var_node(apply_on_var_node)
    .fallback();
 }} // images2neibs

 } // namespace mgb::imperative
--- a/src/core/include/megbrain/ir/ops.td
+++ b/src/core/include/megbrain/ir/ops.td
@@ -79,6 +79,8 @@ def BatchConvBias : MgbHashableOp<"BatchConvBias", [BatchConvBiasParam, Executio
  );
 }

 def Images2Neibs : MgbHashableOp<"Images2Neibs", [Images2NeibsParam]>;

 def BatchNorm : MgbHashableOp<"BatchNorm", [BNParam]>;

 def ROIAlign: MgbHashableOp<"ROIAlign", [ROIAlignParam]>;