feat(mge/opr): add interpolate nearest mode

GitOrigin-RevId: d384b87f50
4 years ago · 0558b2123d
--- a/dnn/include/megdnn/oprs/cv.h
+++ b/dnn/include/megdnn/oprs/cv.h
@@ -198,6 +198,9 @@ public:
 protected:
    //! get origin coord
    std::pair<float, int> get_origin_coord(float scale, int size, int idx);
    //! get nearest index in src
    int get_nearest_src(float scale, int size, int idx);
    void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst);
 };
--- a/dnn/src/common/resize.cpp
+++ b/dnn/src/common/resize.cpp
@@ -6,9 +6,11 @@
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
 #include "megdnn/handle.h"
 #include "megdnn/oprs.h"
 #include "src/common/utils.h"
@@ -26,8 +28,9 @@ void ResizeBase::check_layout_fwd(const TensorLayout& src,
                  errmsg().c_str());
    if (param().format == Param::Format::NCHW) {
        megdnn_assert(dst.shape[1] == src.shape[1], "%s", errmsg().c_str());
        megdnn_assert(param().imode ==
                      param::Resize::InterpolationMode::INTER_LINEAR);
        auto imode = param().imode;
        megdnn_assert(imode == param::Resize::InterpolationMode::INTER_LINEAR ||
                      imode == param::Resize::InterpolationMode::NEAREST);
    } else if (param().format == Param::Format::NHWC) {
        megdnn_assert(dst.shape[3] == src.shape[3], "%s", errmsg().c_str());
    } else if (param().format == Param::Format::NCHW4) {
@@ -79,6 +82,9 @@ std::pair<float, int> ResizeBase::get_origin_coord(float scale, int size,
    return {alpha, origin_idx};
 }
 int ResizeBase::get_nearest_src(float scale, int size, int idx) {
    return std::min(static_cast<int>(idx / scale), size - 1);
 }
 }  // namespace megdnn
 // vim: syntax=cpp.doxygen
--- a/dnn/src/cuda/resize/backward.cpp
+++ b/dnn/src/cuda/resize/backward.cpp
@@ -30,8 +30,9 @@ void ResizeBackwardImpl::exec(_megdnn_tensor_in diff, _megdnn_tensor_out grad,
    size_t max_batch_size = max_batch_x_channel / C;
    while (N > 0) {
        size_t curr_batch_size = N > max_batch_size ? max_batch_size : N;
        resize::backward_data_proxy(diff_ptr, grad_ptr, curr_batch_size, C, IH,
                                    IW, OH, OW, stream);
        resize::backward_data_proxy(resize::get_imode(param().imode), diff_ptr,
                                    grad_ptr, curr_batch_size, C, IH, IW, OH,
                                    OW, stream);
        if (N <= max_batch_size) {
            break;
--- a/dnn/src/cuda/resize/backward.cu
+++ b/dnn/src/cuda/resize/backward.cu
@@ -17,9 +17,9 @@ namespace megdnn {
 namespace cuda {
 namespace resize {
 __global__ void resize_bwd_kernel(const float* hidden, float* dst, int N, int C,
                                  int IH, int IW, int OH, int OW, float scale_h,
                                  float scale_w) {
 __global__ void resize_bwd_linear_kernel(const float* hidden, float* dst, int N,
                                         int C, int IH, int IW, int OH, int OW,
                                         float scale_h, float scale_w) {
    int n = blockIdx.z;
    int ow = blockIdx.x * blockDim.x + threadIdx.x;
    int oh = blockIdx.y * blockDim.y + threadIdx.y;
@@ -51,8 +51,30 @@ __global__ void resize_bwd_kernel(const float* hidden, float* dst, int N, int C,
    }
 }
 void backward_data_proxy(const float* diff, float* grad, int N, int C, int IH,
                         int IW, int OH, int OW, cudaStream_t stream) {
 __global__ void resize_bwd_nearest_kernel(const float* hidden, float* dst,
                                          int N, int C, int IH, int IW, int OH,
                                          int OW, float scale_h,
                                          float scale_w) {
    int n = blockIdx.z;
    int ow = blockIdx.x * blockDim.x + threadIdx.x;
    int oh = blockIdx.y * blockDim.y + threadIdx.y;
    hidden += n * C * OH * OW;
    dst += n * C * IH * IW;
    if (ow < OW && oh < OH) {
        int ih = get_nearest_src(scale_h, IH, oh);
        int iw = get_nearest_src(scale_w, IW, ow);
        for (int c = 0; c < C; ++c) {
            atomicAdd(dst + ih * IW + iw,
                      hidden[oh * OW + ow]);
            hidden += OH * OW;
            dst += IH * IW;
        }
    }
 }
 void backward_data_proxy(InterpolationMode imode, const float* diff,
                         float* grad, int N, int C, int IH, int IW, int OH,
                         int OW, cudaStream_t stream) {
    const int BY = 16, BX = 32;
    {
        dim3 threads(BX, BY);
@@ -61,8 +83,14 @@ void backward_data_proxy(const float* diff, float* grad, int N, int C, int IH,
                                   stream));
        float scale_h = static_cast<float>(OH) / IH;
        float scale_w = static_cast<float>(OW) / IW;
        resize_bwd_kernel<<<blocks, threads, 0, stream>>>(
                diff, grad, N, C, IH, IW, OH, OW, scale_h, scale_w);
        if(imode == InterpolationMode::INTER_LINEAR) {
            resize_bwd_linear_kernel<<<blocks, threads, 0, stream>>>(
                    diff, grad, N, C, IH, IW, OH, OW, scale_h, scale_w);
        }
        else if (imode == InterpolationMode::INTER_NEAREST) {
            resize_bwd_nearest_kernel<<<blocks, threads, 0, stream>>>(
                    diff, grad, N, C, IH, IW, OH, OW, scale_h, scale_w);
        }
    }
    after_kernel_launch();
 }
--- a/dnn/src/cuda/resize/common.cuh
+++ b/dnn/src/cuda/resize/common.cuh
@@ -28,6 +28,10 @@ __device__ inline void get_origin_coord(float scale, int size, int idx,
    }
 }
 __device__ inline int get_nearest_src(float scale, int size, int idx) {
    return min(static_cast<int>(idx / scale), size - 1);
 }
 }  // namespace resize
 }  // namespace cuda
 }  // namespace megdnn
--- a/dnn/src/cuda/resize/common.h
+++ b/dnn/src/cuda/resize/common.h
@@ -20,16 +20,17 @@ namespace resize {
 // all these kernels use bilinear interpolation
 template <typename ctype>
 void forward_proxy(bool is_nhwc, const ctype* src, ctype* dst, int N, int C,
                   int IH, int IW, int OH, int OW, int S_IN, int S_IC, int S_IH,
                   int S_IW, cudaStream_t stream);
 void forward_proxy(bool is_nhwc, InterpolationMode imode, const ctype* src,
                   ctype* dst, int N, int C, int IH, int IW, int OH, int OW,
                   int S_IN, int S_IC, int S_IH, int S_IW, cudaStream_t stream);
 template <typename ctype>
 void forward_proxy_nchw4(const ctype* src, ctype* dst, int N, int C, int IH,
                         int IW, int OH, int OW, cudaStream_t stream);
 void backward_data_proxy(const float* diff, float* grad, int N, int C, int IH,
                         int IW, int OH, int OW, cudaStream_t stream);
 void backward_data_proxy(InterpolationMode imode, const float* diff,
                         float* grad, int N, int C, int IH, int IW, int OH,
                         int OW, cudaStream_t stream);
 }  // namespace resize
 }  // namespace cuda
--- a/dnn/src/cuda/resize/forward.cpp
+++ b/dnn/src/cuda/resize/forward.cpp
@@ -9,6 +9,7 @@
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
 #include "src/common/cv/common.h"
 #include "src/common/cv/enums.h"
 #include "src/cuda/handle.h"
 #include "src/cuda/resize/common.h"
 #include "src/cuda/resize/helper.h"
@@ -146,19 +147,23 @@ void ResizeImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst,
                                    C, IH, IW, OH, OW, stream);
        return;
    }
    megdnn_assert(param().imode == Param::InterpolationMode::LINEAR,
    megdnn_assert(param().imode == Param::InterpolationMode::LINEAR ||
                          param().imode == Param::InterpolationMode::NEAREST,
                  "unsupported interpolation mode for NCHW format");
    if (src.layout.dtype == dtype::Float32{}) {
        resize::forward_proxy(is_nhwc, src.ptr<dt_float32>(),
                              dst.ptr<dt_float32>(), src.layout[0], C, IH, IW,
                              OH, OW, S_IN, S_IC, S_IH, S_IW, stream);
        resize::forward_proxy(is_nhwc, resize::get_imode((param().imode)),
                              src.ptr<dt_float32>(), dst.ptr<dt_float32>(),
                              src.layout[0], C, IH, IW, OH, OW, S_IN, S_IC,
                              S_IH, S_IW, stream);
    } else if (src.layout.dtype == dtype::Uint8()) {
        resize::forward_proxy(is_nhwc, src.ptr<dt_uint8>(), dst.ptr<dt_uint8>(),
        resize::forward_proxy(is_nhwc, resize::get_imode((param().imode)),
                              src.ptr<dt_uint8>(), dst.ptr<dt_uint8>(),
                              src.layout[0], C, IH, IW, OH, OW, S_IN, S_IC,
                              S_IH, S_IW, stream);
    } else if (src.layout.dtype == dtype::Int8()) {
        resize::forward_proxy(is_nhwc, src.ptr<dt_int8>(), dst.ptr<dt_int8>(),
        resize::forward_proxy(is_nhwc, resize::get_imode((param().imode)),
                              src.ptr<dt_int8>(), dst.ptr<dt_int8>(),
                              src.layout[0], C, IH, IW, OH, OW, S_IN, S_IC,
                              S_IH, S_IW, stream);
    } else {
--- a/dnn/src/cuda/resize/forward.cu
+++ b/dnn/src/cuda/resize/forward.cu
@@ -32,9 +32,10 @@ struct DirectSrcVisitor {
 };
 template <typename ctype, typename SrcVisitor, typename OutputConverter>
 __global__ void kern_general(SrcVisitor src, ctype* __restrict dst, int C,
                             int IH, int IW, int OH, int OW, int S_IN, int S_IC,
                             int S_IH, int S_IW, float scale_h, float scale_w) {
 __global__ void kern_general_linear(SrcVisitor src, ctype* __restrict dst,
                                    int C, int IH, int IW, int OH, int OW,
                                    int S_IN, int S_IC, int S_IH, int S_IW,
                                    float scale_h, float scale_w) {
    OutputConverter output_converter;
    int ow = blockIdx.x * blockDim.x + threadIdx.x;
    int oh = blockIdx.y * blockDim.y + threadIdx.y;
@@ -64,6 +65,31 @@ __global__ void kern_general(SrcVisitor src, ctype* __restrict dst, int C,
    }
 }
 template <typename ctype, typename SrcVisitor, typename OutputConverter>
 __global__ void kern_general_nearest(SrcVisitor src, ctype* __restrict dst,
                                     int C, int IH, int IW, int OH, int OW,
                                     int S_IN, int S_IC, int S_IH, int S_IW,
                                     float scale_h, float scale_w) {
    OutputConverter output_converter;
    int ow = blockIdx.x * blockDim.x + threadIdx.x;
    int oh = blockIdx.y * blockDim.y + threadIdx.y;
    const ctype* __restrict sptr = src.get(blockIdx.z, S_IN);
    dst += blockIdx.z * C * OH * OW;
    if (ow < OW && oh < OH) {
        int ih = get_nearest_src(scale_h, IH, oh);
        int iw = get_nearest_src(scale_w, IW, ow);
        for (int c = 0; c < C; ++c) {
            dst[oh * OW + ow] = output_converter(
                    sptr[ih * S_IH + iw * S_IW]);
            sptr += S_IC;
            dst += OH * OW;
        }
    }
 }
 template <typename ctype, typename SrcVisitor, typename OutputConverter>
 __global__ void kern_general_nhwc(SrcVisitor src, ctype* __restrict dst, int C,
                                  int IH, int IW, int OH, int OW, float scale_h,
@@ -94,9 +120,10 @@ __global__ void kern_general_nhwc(SrcVisitor src, ctype* __restrict dst, int C,
 }
 template <typename ctype, typename SrcVisitor>
 void dispatch_with_visitor(bool is_nhwc, SrcVisitor src, ctype* dst, int N,
                           int C, int IH, int IW, int OH, int OW, int S_IN,
                           int S_IC, int S_IH, int S_IW, cudaStream_t stream) {
 void dispatch_with_visitor(bool is_nhwc, InterpolationMode imode,
                           SrcVisitor src, ctype* dst, int N, int C, int IH,
                           int IW, int OH, int OW, int S_IN, int S_IC, int S_IH,
                           int S_IW, cudaStream_t stream) {
    const int BY = 16, BX = 32;
    const int max_batch_size = 65535;
@@ -113,10 +140,19 @@ void dispatch_with_visitor(bool is_nhwc, SrcVisitor src, ctype* dst, int N,
                    <<<blocks, threads, 0, stream>>>(src, dst, C, IH, IW, OH,
                                                     OW, scale_h, scale_w);
        } else {
            kern_general<ctype, SrcVisitor, rounding::RoundingConverter<ctype>>
                    <<<blocks, threads, 0, stream>>>(src, dst, C, IH, IW, OH,
                                                     OW, S_IN, S_IC, S_IH, S_IW,
                                                     scale_h, scale_w);
            if (imode == InterpolationMode::INTER_LINEAR) {
                kern_general_linear<ctype, SrcVisitor,
                                    rounding::RoundingConverter<ctype>>
                        <<<blocks, threads, 0, stream>>>(
                                src, dst, C, IH, IW, OH, OW, S_IN, S_IC, S_IH,
                                S_IW, scale_h, scale_w);
            } else if (imode == InterpolationMode::INTER_NEAREST) {
                kern_general_nearest<ctype, SrcVisitor,
                                     rounding::RoundingConverter<ctype>>
                        <<<blocks, threads, 0, stream>>>(
                                src, dst, C, IH, IW, OH, OW, S_IN, S_IC, S_IH,
                                S_IW, scale_h, scale_w);
            }
        }
        N -= curr_batch_size;
        src.move_batch(curr_batch_size, C * IH * IW);
@@ -194,13 +230,14 @@ namespace cuda {
 namespace resize {
 template <typename ctype>
 void forward_proxy(bool is_nhwc, const ctype* src, ctype* dst, int N, int C,
                   int IH, int IW, int OH, int OW, int S_IN, int S_IC, int S_IH,
                   int S_IW, cudaStream_t stream) {
 void forward_proxy(bool is_nhwc, InterpolationMode imode, const ctype* src,
                   ctype* dst, int N, int C, int IH, int IW, int OH, int OW,
                   int S_IN, int S_IC, int S_IH, int S_IW,
                   cudaStream_t stream) {
    DirectSrcVisitor<ctype> visitor;
    visitor.ptr = src;
    dispatch_with_visitor(is_nhwc, visitor, dst, N, C, IH, IW, OH, OW, S_IN,
                          S_IC, S_IH, S_IW, stream);
    dispatch_with_visitor(is_nhwc, imode, visitor, dst, N, C, IH, IW, OH, OW,
                          S_IN, S_IC, S_IH, S_IW, stream);
    after_kernel_launch();
 }
@@ -214,7 +251,7 @@ void forward_proxy_nchw4(const ctype* src, ctype* dst, int N, int C, int IH,
 }
 #define INST(ctype)                                                        \
    template void forward_proxy(bool, const ctype*, ctype*, int, int, int, \
    template void forward_proxy(bool, InterpolationMode, const ctype*, ctype*, int, int, int, \
                                int, int, int, int, int, int, int,         \
                                cudaStream_t);
 INST(float)
--- a/dnn/src/fallback/resize/opr_impl.cpp
+++ b/dnn/src/fallback/resize/opr_impl.cpp
@@ -116,7 +116,9 @@ void ResizeImpl::kern_fallback_nhwc(const KernParam<ctype>& kern_param) {
 void ResizeImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst,
                      _megdnn_workspace workspace) {
    check_exec(src.layout, dst.layout, workspace.size);
    if (param().format == param::Resize::Format::NCHW4) {
    if (param().format == param::Resize::Format::NCHW4 ||
        (param().format == param::Resize::Format::NCHW &&
         param().imode == param::Resize::InterpolationMode::NEAREST)) {
        naive::ResizeImpl::exec(src, dst, workspace);
        return;
    }
--- a/dnn/src/naive/resize/opr_impl.cpp
+++ b/dnn/src/naive/resize/opr_impl.cpp
@@ -10,12 +10,14 @@
 */
 #include "src/common/rounding_converter.cuh"
 #include "src/common/utils.cuh"
 #include "src/naive/handle.h"
 #include "src/naive/resize/opr_impl.h"
 #include "src/naive/resize/resize_cv.h"
 #include "midout.h"
 MIDOUT_DECL(megdnn_naive_resize_layout)
 MIDOUT_DECL(megdnn_naive_resize_layout_nearest)
 using namespace megdnn;
 using namespace naive;
@@ -86,6 +88,28 @@ INST(dt_qint8);
 INST(dt_quint8);
 #undef INST
 template <typename ctype>
 void ResizeImpl::kern_nchw_nearest (const KernParam<ctype>& kern_param) {
    megdnn_assert(kern_param.format == Format::NCHW);
    UNPACK_RESIZE_FWD_KERN_PARAM_WITH_STRIDE(kern_param);
    float scale_h = static_cast<float>(OH) / IH;
    float scale_w = static_cast<float>(OW) / IW;
    rep(n, N) {
        rep(oh, OH) rep(ow, OW) {
            auto ih = get_nearest_src(scale_h, IH, oh);
            auto iw = get_nearest_src(scale_w, IW, ow);
            rep(c, static_cast<int>(C)) {
                dptr[c * OH * OW + oh * OW + ow] = sptr[c * S_IC + ih * S_IH + iw * S_IW];
            }
        }
        sptr += S_IN;
        dptr += C * OH * OW;
    }
 }
 template <typename ctype>
 void ResizeImpl::kern_naive(const KernParam<ctype>& kern_param) {
    if (kern_param.format == Format::NHWC) {
@@ -266,6 +290,39 @@ void ResizeImpl::kern_naive_nchw4(const KernParam<ctype>& kern_param) {
 void ResizeImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst,
                      _megdnn_workspace workspace) {
    check_exec(src.layout, dst.layout, workspace.size);
    if (param().format == param::Resize::Format::NCHW &&
        param().imode == param::Resize::InterpolationMode::NEAREST) {
 #define cb(dt, ct, _midout_iv)                                             \
    case DTypeTrait<dt>::enumv: {                                          \
        MIDOUT_BEGIN(megdnn_naive_resize_layout_nearest,                   \
                     midout_iv(_midout_iv)) {                              \
            auto kparam = KernParam<ct>::from_tensors(param().format, src, \
                                                      dst, workspace);     \
            MEGDNN_DISPATCH_CPU_KERN_OPR(kern_nchw_nearest(kparam));       \
        }                                                                  \
        MIDOUT_END();                                                      \
        return;                                                            \
    }
        switch (src.layout.dtype.enumv()) {
            cb(dtype::Float32, float, 0);
            DNN_INC_FLOAT16(cb(dtype::Float16, dt_float16, 1));
            cb(dtype::Int8, int8_t, 2);
            cb(dtype::QuantizedS8, int8_t, 3);
            cb(dtype::Uint8, uint8_t, 4);
            cb(dtype::Quantized8Asymm, uint8_t, 5);
            default:
                megdnn_throw(ssprintf("Unsupported input DType in Resize "
                                      "NEAREST mode: %s",
                                      src.layout.dtype.name())
                                     .c_str());
                return;
        }
 #undef cb
 #undef cb
    }
    if ((param().format == param::Resize::Format::NCHW ||
         (src.layout[3] != 1 && src.layout[3] != 3) ||
         !is_nhwc_contig_wc(src.layout)) ||
@@ -306,8 +363,8 @@ void ResizeImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in dst,
 void ResizeBackwardImpl::exec(_megdnn_tensor_in diff, _megdnn_tensor_out grad,
                              _megdnn_workspace workspace) {
    check_exec(diff.layout, grad.layout, workspace.size);
    megdnn_assert(param().format == param::WarpPerspective::Format::NCHW,
                  "invalid warp_perspective format");
    megdnn_assert(param().format == param::Resize::Format::NCHW,
                  "invalid resize format");
    const int N = grad.layout.shape[0], C = grad.layout.shape[1],
              IH = grad.layout.shape[2], IW = grad.layout.shape[3];
    const int OH = diff.layout.shape[2], OW = diff.layout.shape[3];
@@ -321,28 +378,37 @@ void ResizeBackwardImpl::exec(_megdnn_tensor_in diff, _megdnn_tensor_out grad,
        std::memset(sptr, 0, sizeof(float) * N * C * IH * IW);
        rep(n, N) {
            rep(oh, OH) rep(ow, OW) {
                auto coord_h = get_origin_coord(scale_h, IH, oh);
                auto coord_w = get_origin_coord(scale_w, IW, ow);
                float alphah = coord_h.first;
                float alphaw = coord_w.first;
                int ih0 = coord_h.second;
                int ih1 = ih0 + 1;
                int iw0 = coord_w.second;
                int iw1 = iw0 + 1;
                rep(c, C) {
                    float hidden = hptr[c * OH * OW + oh * OW + ow];
                    sptr[c * IH * IW + ih0 * IW + iw0] +=
                            (1.0f - alphaw) * (1.0f - alphah) * hidden;
                    sptr[c * IH * IW + ih1 * IW + iw0] +=
                            (1.0f - alphaw) * alphah * hidden;
                    sptr[c * IH * IW + ih0 * IW + iw1] +=
                            alphaw * (1.0f - alphah) * hidden;
                    sptr[c * IH * IW + ih1 * IW + iw1] +=
                            alphaw * alphah * hidden;
                if(param().imode == InterpolationMode::INTER_LINEAR) {
                    auto coord_h = get_origin_coord(scale_h, IH, oh);
                    auto coord_w = get_origin_coord(scale_w, IW, ow);
                    float alphah = coord_h.first;
                    float alphaw = coord_w.first;
                    int ih0 = coord_h.second;
                    int ih1 = ih0 + 1;
                    int iw0 = coord_w.second;
                    int iw1 = iw0 + 1;
                    rep(c, C) {
                        float hidden = hptr[c * OH * OW + oh * OW + ow];
                        sptr[c * IH * IW + ih0 * IW + iw0] +=
                                (1.0f - alphaw) * (1.0f - alphah) * hidden;
                        sptr[c * IH * IW + ih1 * IW + iw0] +=
                                (1.0f - alphaw) * alphah * hidden;
                        sptr[c * IH * IW + ih0 * IW + iw1] +=
                                alphaw * (1.0f - alphah) * hidden;
                        sptr[c * IH * IW + ih1 * IW + iw1] +=
                                alphaw * alphah * hidden;
                    }
                } else if (param().imode == InterpolationMode::NEAREST) {
                    auto ih = get_nearest_src(scale_h, IH, oh);
                    auto iw = get_nearest_src(scale_w, IW, ow);
                    rep(c, static_cast<int>(C)) {
                        sptr[c * IH * IW + ih * IW + iw] += hptr[c * OH * OW + oh * OW + ow];
                    }
                }
                else megdnn_throw("unsupported mode in ResizeBackwardImpl");
            }
            sptr += C * IH * IW;
            hptr += C * OH * OW;
--- a/dnn/src/naive/resize/opr_impl.h
+++ b/dnn/src/naive/resize/opr_impl.h
@@ -46,6 +46,9 @@ private:
    template <typename ctype>
    void kern_naive(const KernParam<ctype>& kern_param);
    template <typename ctype>
    void kern_nchw_nearest(const KernParam<ctype>& kern_param);
    template <typename ctype>
    void kern_naive_nhwc(const KernParam<ctype>& kern_param);
--- a/dnn/test/common/resize.h
+++ b/dnn/test/common/resize.h
@@ -18,6 +18,8 @@ namespace megdnn {
 namespace test {
 namespace resize {
 using IMode = param::Resize::InterpolationMode;
 struct TestArg {
    param::Resize param;
    TensorShape src;
@@ -62,17 +64,18 @@ static void set_nchw_args(std::vector<TestArg>& args) {
    args.emplace_back(param, TensorShape{1, 2, 6, 8}, TensorShape{1, 2, 3, 4});
 }
 static inline std::vector<TestArg> get_args() {
 static inline std::vector<TestArg> get_args(IMode imode = IMode::INTER_LINEAR) {
    std::vector<TestArg> args;
    set_nchw_args(args);
    if(imode == IMode::INTER_LINEAR) {
    //! test NHWC with ch != 1 or ch != 3
    param::Resize param;
    param.format = param::Resize::Format::NHWC;
    param.imode = param::Resize::InterpolationMode::LINEAR;
    args.emplace_back(param, TensorShape{2, 2, 3, 4}, TensorShape{2, 4, 6, 4});
    args.emplace_back(param, TensorShape{2, 4, 6, 4}, TensorShape{2, 2, 3, 4});
        param::Resize param;
        param.format = param::Resize::Format::NHWC;
        param.imode = imode;
        args.emplace_back(param, TensorShape{2, 2, 3, 4}, TensorShape{2, 4, 6, 4});
        args.emplace_back(param, TensorShape{2, 4, 6, 4}, TensorShape{2, 2, 3, 4});
    }
    return args;
 }
--- a/dnn/test/cuda/resize.cpp
+++ b/dnn/test/cuda/resize.cpp
@@ -9,6 +9,7 @@
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
 #include "test/common/resize.h"
 #include "src/common/cv/enums.h"
 #include "test/common/benchmarker.h"
 #include "test/common/checker.h"
 #include "test/cuda/fixture.h"
@@ -42,30 +43,33 @@ TEST_F(CUDA, RESIZE_CV) {
 TEST_F(CUDA, RESIZE_FORWARD) {
    using namespace resize;
    std::vector<TestArg> args = get_args();
    Checker<Resize> checker(handle_cuda());
    for (auto&& arg : args) {
        checker.set_param(arg.param)
                .set_dtype(0, dtype::Uint8())
                .set_dtype(1, dtype::Uint8())
                .execs({arg.src, arg.dst});
    }
    for (auto&& arg : args) {
        checker.set_param(arg.param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_epsilon(1e-3)
                .execs({arg.src, arg.dst});
    }
    for (auto&& arg : args) {
        checker.set_param(arg.param)
                .set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_epsilon(1e-3)
                .execs({arg.src, arg.dst});
    IMode modes[2] = {IMode::INTER_LINEAR, IMode::NEAREST};
    for (auto imode : modes) {
        std::vector<TestArg> args = get_args(imode);
        Checker<Resize> checker(handle_cuda());
        for (auto&& arg : args) {
            checker.set_param(arg.param)
                    .set_dtype(0, dtype::Uint8())
                    .set_dtype(1, dtype::Uint8())
                    .execs({arg.src, arg.dst});
        }
        for (auto&& arg : args) {
            checker.set_param(arg.param)
                    .set_dtype(0, dtype::Float32())
                    .set_dtype(1, dtype::Float32())
                    .set_epsilon(1e-3)
                    .execs({arg.src, arg.dst});
        }
        for (auto&& arg : args) {
            checker.set_param(arg.param)
                    .set_dtype(0, dtype::Int8())
                    .set_dtype(1, dtype::Int8())
                    .set_epsilon(1e-3)
                    .execs({arg.src, arg.dst});
        }
    }
 }
@@ -84,42 +88,48 @@ TEST_F(CUDA, RESIZE_NCHW4) {
 }
 TEST_F(CUDA, RESIZE_NCHW_WITH_STRIDE) {
    param::Resize param;
    param.format = param::Resize::Format::NCHW;
    param.imode = param::Resize::InterpolationMode::LINEAR;
    Checker<Resize> checker(handle_cuda());
    checker.set_epsilon(1 + 1e-3)
           .set_param(param);
    auto run = [&](TensorShape src_shape, std::vector<ptrdiff_t> src_layout,
                   TensorShape dst_shape, DType dtype) {
        checker.set_dtype(0, dtype)
               .set_dtype(1, dtype)
               .execl({{src_shape, src_layout, dtype}, {dst_shape, dtype}});
    };
    for (DType& dtype : std::vector<DType>{dtype::Float32(), dtype::Uint8(),
                                           dtype::Int8()}) {
        run({2, 3, 4, 4}, {256, 32, 8, 1}, {2, 3, 3, 3}, dtype);
        run({1, 3, 4, 3}, {105, 35, 7, 2}, {1, 3, 5, 5}, dtype);
        run({1, 3, 40, 40}, {25600, 3200, 80, 1}, {1, 3, 30, 30}, dtype);
        run({2, 3, 4, 4}, {-256, 32, -8, 1}, {2, 3, 3, 3}, dtype);
        run({2, 3, 4, 4}, {256, -32, 8, -1}, {2, 3, 3, 3}, dtype);
        run({2, 3, 4, 4}, {-256, -32, -8, -1}, {2, 3, 3, 3}, dtype);
    IMode modes[2] = {IMode::INTER_LINEAR, IMode::NEAREST};
    for (auto imode : modes) {
        param::Resize param;
        param.format = param::Resize::Format::NCHW;
        param.imode = imode;
        Checker<Resize> checker(handle_cuda());
        checker.set_epsilon(1 + 1e-3)
            .set_param(param);
        auto run = [&](TensorShape src_shape, std::vector<ptrdiff_t> src_layout,
                       TensorShape dst_shape, DType dtype) {
            checker.set_dtype(0, dtype)
                   .set_dtype(1, dtype)
                   .execl({{src_shape, src_layout, dtype}, {dst_shape, dtype}});
        };
        for (DType& dtype : std::vector<DType>{dtype::Float32(), dtype::Uint8(),
                                               dtype::Int8()}) {
            run({2, 3, 4, 4}, {256, 32, 8, 1}, {2, 3, 3, 3}, dtype);
            run({1, 3, 4, 3}, {105, 35, 7, 2}, {1, 3, 5, 5}, dtype);
            run({1, 3, 40, 40}, {25600, 3200, 80, 1}, {1, 3, 30, 30}, dtype);
            run({2, 3, 4, 4}, {-256, 32, -8, 1}, {2, 3, 3, 3}, dtype);
            run({2, 3, 4, 4}, {256, -32, 8, -1}, {2, 3, 3, 3}, dtype);
            run({2, 3, 4, 4}, {-256, -32, -8, -1}, {2, 3, 3, 3}, dtype);
        }
    }
 }
 TEST_F(CUDA, RESIZE_BACKWARD) {
    Checker<ResizeBackward> checker(handle_cuda());
    param::Resize param;
    param.format = param::Resize::Format::NCHW;
    param.imode = param::Resize::InterpolationMode::LINEAR;
    checker.set_param(param);
    checker.execs({{2, 3, 4, 5}, {2, 3, 8, 9}});
    checker.execs({{2, 5, 8, 9}, {2, 5, 4, 5}});
    checker.execs({{2, 5, 8, 5}, {2, 5, 4, 9}});
    checker.execs({{2, 5, 4, 9}, {2, 5, 8, 5}});
    IMode modes[2] = {IMode::INTER_LINEAR, IMode::NEAREST};
    for (auto imode : modes) {
        Checker<ResizeBackward> checker(handle_cuda());
        param::Resize param;
        param.format = param::Resize::Format::NCHW;
        param.imode = imode;
        checker.set_param(param);
        checker.execs({{2, 3, 4, 5}, {2, 3, 8, 9}});
        checker.execs({{2, 5, 8, 9}, {2, 5, 4, 5}});
        checker.execs({{2, 5, 8, 5}, {2, 5, 4, 9}});
        checker.execs({{2, 5, 4, 9}, {2, 5, 8, 5}});
    }
 }
 #if MEGDNN_WITH_BENCHMARK
--- a/imperative/python/megengine/functional/vision.py
+++ b/imperative/python/megengine/functional/vision.py
@@ -522,29 +522,13 @@ def interpolate(
        if align_corners is None:
            align_corners = False
    if (
        size is not None
        and scale_factor is None
        and not align_corners
        and mode == "bilinear"
        and inp.ndim in [4, 5]
    ):
        # fastpath for interpolate
        op = builtin.Resize(imode="linear", format="NCHW")
        shape = astensor1d(size, inp, dtype="int32", device=inp.device)
        (result,) = apply(op, inp, shape)
        return result
    if mode == "linear":
        inp = expand_dims(inp, 3)
    if inp.ndim != 4:
        raise ValueError("shape of input tensor must correspond to the operartion mode")
    if size is None:
        if scale_factor is None:
            raise ValueError("scale_factor must not be None when size is None")
    def get_dsize(scale_factor):
        if isinstance(scale_factor, (float, int)):
            scale_factor = float(scale_factor)
            if mode == "linear":
@@ -572,6 +556,13 @@ def interpolate(
            for i in range(2)
        )
        dsize = concat([dsize[0], dsize[1]], axis=0)
        return dsize
    if size is None:
        if scale_factor is None:
            raise ValueError("scale_factor must not be None when size is None")
        dsize = get_dsize(scale_factor)
    else:
        if scale_factor is not None:
            raise ValueError("scale_factor must be None when size is provided")
@@ -583,6 +574,15 @@ def interpolate(
                raise ValueError("under linear mode, size can only be single value")
        dsize = size
    if not align_corners and mode in ("bilinear", "nearest") and inp.ndim in [4, 5]:
        # fastpath for interpolate
        op = builtin.Resize(
            imode="linear" if mode == "bilinear" else "nearest", format="NCHW"
        )
        shape = astensor1d(dsize, inp, dtype="int32", device=inp.device)
        (result,) = apply(op, inp, shape)
        return result
    oh, ow = dsize[0], dsize[1]
    ih, iw = inp.shape[2], inp.shape[3]
@@ -630,15 +630,10 @@ def interpolate(
        if mode == "linear":
            ret = reshape(ret, ret.shape[0:3])
    else:
        # only NHWC format support "cubic" and "nearest" mode
        # only NHWC format support "cubic" mode
        assert mode == "bicubic"
        inp = transpose(inp, (0, 2, 3, 1))
        ret = warp_perspective(
            inp,
            weight,
            dsize,
            format="NHWC",
            interp_mode="cubic" if mode == "bicubic" else mode,
        )
        ret = warp_perspective(inp, weight, dsize, format="NHWC", interp_mode="cubic",)
        ret = transpose(ret, (0, 3, 1, 2))
    return ret