diff --git a/src/layer/arm/interp_arm.cpp b/src/layer/arm/interp_arm.cpp
index 903c4a703..49789cac5 100644
--- a/src/layer/arm/interp_arm.cpp
+++ b/src/layer/arm/interp_arm.cpp
@@ -15,219 +15,247 @@
 #include "interp_arm.h"
 #include <math.h>
 
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
 namespace ncnn {
 
+#include "interp_bilinear.h"
+#include "interp_bicubic.h"
+#include "interp_bilinear_bf16s.h"
+#include "interp_bicubic_bf16s.h"
+
+#if __ARM_NEON
+#include "interp_bilinear_pack4.h"
+#include "interp_bicubic_pack4.h"
+#include "interp_bilinear_pack4_bf16s.h"
+#include "interp_bicubic_pack4_bf16s.h"
+#endif
+
 DEFINE_LAYER_CREATOR(Interp_arm)
 
-static void linear_coeffs(int w, int outw, int* xofs, float* alpha)
+Interp_arm::Interp_arm()
 {
-    double scale = (double)w / outw;
+#if __ARM_NEON
+    support_packing = true;
+#endif // __ARM_NEON
 
-    for (int dx = 0; dx < outw; dx++)
-    {
-        float fx = (float)((dx + 0.5) * scale - 0.5);
-        int sx = floor(fx);
-        fx -= sx;
+    support_bf16_storage = true;
+}
 
-        if (sx < 0)
-        {
-            sx = 0;
-            fx = 0.f;
-        }
-        if (sx >= w - 1)
-        {
-            sx = w - 2;
-            fx = 1.f;
-        }
+int Interp_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    if (opt.use_bf16_storage)
+        return forward_bf16s(bottom_blob, top_blob, opt);
 
-        xofs[dx] = sx;
+    int h = bottom_blob.h;
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+    int dims = bottom_blob.dims;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
 
-        alpha[dx*2    ] = 1.f - fx;
-        alpha[dx*2 + 1] = fx;
+    if (dims == 1)
+    {
+        return Interp::forward(bottom_blob, top_blob, opt);
     }
-}
-
-static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
-{
-    int w = dst.w;
-    int h = dst.h;
 
-    // loop body
-    Mat rowsbuf0(w);
-    Mat rowsbuf1(w);
-    float* rows0 = rowsbuf0;
-    float* rows1 = rowsbuf1;
+    int outh = output_height;
+    int outw = output_width;
 
-    int prev_sy1 = -2;
+    if (outh == 0 || outw == 0)
+    {
+        outh = h * height_scale;
+        outw = w * width_scale;
+    }
 
-    for (int dy = 0; dy < h; dy++ )
+    if (outh == h && outw == w)
     {
-        int sy = yofs[dy];
+        top_blob = bottom_blob;
+        return 0;
+    }
+
+    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
 
-        if (sy == prev_sy1)
-        {
-            // reuse all rows
-        }
-        else if (sy == prev_sy1 + 1)
-        {
-            // hresize one row
-            float* rows0_old = rows0;
-            rows0 = rows1;
-            rows1 = rows0_old;
-            const float* S1 = src.row(sy+1);
-
-            const float* alphap = alpha;
-            float* rows1p = rows1;
-            int dx = 0;
 #if __ARM_NEON
-            for ( ; dx+1 < w; dx += 2 )
+    if (elempack == 4)
+    {
+        if (resize_type == 1)// nearest
+        {
+            const float hs = output_height ? h / (float)output_height : 1.f / height_scale;
+            const float ws = output_width ? w / (float)output_width : 1.f / width_scale;
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
             {
-                int sx = xofs[dx];
-                int sxn = xofs[dx+1];
-                const float* S1p = S1 + sx;
-                const float* S1np = S1 + sxn;
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
 
-                float32x4_t _a = vld1q_f32(alphap);
-                float32x2_t _S1 = vld1_f32(S1p);
-                float32x2_t _S1n = vld1_f32(S1np);
+                for (int y = 0; y < outh; y++)
+                {
+                    int in_y = std::min((int) (y * hs), (h - 1));
 
-                float32x4_t _S1S1n = vcombine_f32(_S1, _S1n);
-                float32x4_t _ms1 = vmulq_f32(_S1S1n, _a);
-                float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1));
+                    const float* ptr = src.row(in_y);
+                    float* outptr = dst.row(y);
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int in_x = std::min((int) (x * ws), (w - 1));
 
-                vst1_f32(rows1p + dx, _rows1);
+                        float32x4_t _p = vld1q_f32(ptr + in_x * 4);
+                        vst1q_f32(outptr, _p);
 
-                alphap += 4;
+                        outptr += 4;
+                    }
+                }
             }
-#endif // __ARM_NEON
-            for ( ; dx < w; dx++ )
-            {
-                int sx = xofs[dx];
-                const float* S1p = S1 + sx;
+        }
+
+        if (resize_type == 2)// bilinear
+        {
+            int* buf = new int[outw + outh + outw*2 + outh*2];
+
+            int* xofs = buf;//new int[outw];
+            int* yofs = buf + outw;//new int[outh];
 
-                float a0 = alphap[0];
-                float a1 = alphap[1];
-                rows1p[dx] = S1p[0]*a0 + S1p[1]*a1;
+            float* alpha = (float*)(buf + outw + outh);//new float[outw * 2];
+            float* beta = (float*)(buf + outw + outh + outw*2);//new float[outh * 2];
 
-                alphap += 2;
+            linear_coeffs(w, outw, xofs, alpha);
+            linear_coeffs(h, outh, yofs, beta);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs);
             }
+
+            delete[] buf;
         }
-        else
+
+        if (resize_type == 3)// bicubic
         {
-            // hresize two rows
-            const float* S0 = src.row(sy);
-            const float* S1 = src.row(sy+1);
-
-            const float* alphap = alpha;
-            float* rows0p = rows0;
-            float* rows1p = rows1;
-            int dx = 0;
-#if __ARM_NEON
-            for ( ; dx+1 < w; dx += 2 )
+            int* buf = new int[outw + outh + outw*4 + outh*4];
+
+            int* xofs = buf;//new int[outw];
+            int* yofs = buf + outw;//new int[outh];
+
+            float* alpha = (float*)(buf + outw + outh);//new float[outw * 4];
+            float* beta = (float*)(buf + outw + outh + outw*4);//new float[outh * 4];
+
+            cubic_coeffs(w, outw, xofs, alpha);
+            cubic_coeffs(h, outh, yofs, beta);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
             {
-                int sx = xofs[dx];
-                int sxn = xofs[dx+1];
-                const float* S0p = S0 + sx;
-                const float* S1p = S1 + sx;
-                const float* S0np = S0 + sxn;
-                const float* S1np = S1 + sxn;
-
-                float32x4_t _a = vld1q_f32(alphap);
-                float32x2_t _S0 = vld1_f32(S0p);
-                float32x2_t _S1 = vld1_f32(S1p);
-                float32x2_t _S0n = vld1_f32(S0np);
-                float32x2_t _S1n = vld1_f32(S1np);
-
-                float32x4_t _S0S0n = vcombine_f32(_S0, _S0n);
-                float32x4_t _S1S1n = vcombine_f32(_S1, _S1n);
-                float32x4_t _ms0 = vmulq_f32(_S0S0n, _a);
-                float32x4_t _ms1 = vmulq_f32(_S1S1n, _a);
-                float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0));
-                float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1));
-
-                vst1_f32(rows0p + dx, _rows0);
-                vst1_f32(rows1p + dx, _rows1);
-
-                alphap += 4;
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs);
             }
+
+            delete[] buf;
+        }
+
+        return 0;
+    }
 #endif // __ARM_NEON
-            for ( ; dx < w; dx++ )
-            {
-                int sx = xofs[dx];
-                const float* S0p = S0 + sx;
-                const float* S1p = S1 + sx;
 
-                float a0 = alphap[0];
-                float a1 = alphap[1];
-                rows0p[dx] = S0p[0]*a0 + S0p[1]*a1;
-                rows1p[dx] = S1p[0]*a0 + S1p[1]*a1;
+    if (resize_type == 1)// nearest
+    {
+        const float hs = output_height ? h / (float)output_height : 1.f / height_scale;
+        const float ws = output_width ? w / (float)output_width : 1.f / width_scale;
 
-                alphap += 2;
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            for (int y = 0; y < outh; y++)
+            {
+                int in_y = std::min((int) (y * hs), (h - 1));
+
+                const float* ptr = src.row(in_y);
+                float* outptr = dst.row(y);
+                for (int x = 0; x < outw; x++)
+                {
+                    int in_x = std::min((int) (x * ws), (w - 1));
+                    *outptr++ = ptr[in_x];
+                }
             }
         }
+    }
 
-        prev_sy1 = sy;
+    if (resize_type == 2)// bilinear
+    {
+        int* buf = new int[outw + outh + outw*2 + outh*2];
 
-        // vresize
-        float b0 = beta[0];
-        float b1 = beta[1];
+        int* xofs = buf;//new int[outw];
+        int* yofs = buf + outw;//new int[outh];
 
-        float* rows0p = rows0;
-        float* rows1p = rows1;
-        float* Dp = dst.row(dy);
+        float* alpha = (float*)(buf + outw + outh);//new float[outw * 2];
+        float* beta = (float*)(buf + outw + outh + outw*2);//new float[outh * 2];
 
-#if __ARM_NEON
-        int nn = w >> 3;
-#else
-        int nn = 0;
-#endif
-        int remain = w - (nn << 3);
+        linear_coeffs(w, outw, xofs, alpha);
+        linear_coeffs(h, outh, yofs, beta);
 
-#if __ARM_NEON
-        float32x4_t _b0 = vdupq_n_f32(b0);
-        float32x4_t _b1 = vdupq_n_f32(b1);
-        for (; nn>0; nn--)
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
         {
-            float32x4_t _rows0 = vld1q_f32(rows0p);
-            float32x4_t _rows1 = vld1q_f32(rows1p);
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bilinear_image(src, dst, alpha, xofs, beta, yofs);
+        }
 
-            float32x4_t _D = vmulq_f32(_rows0, _b0);
-            _D = vmlaq_f32(_D, _rows1, _b1);
+        delete[] buf;
+    }
 
-            vst1q_f32(Dp, _D);
+    if (resize_type == 3)// bicubic
+    {
+        int* buf = new int[outw + outh + outw*4 + outh*4];
 
-            float32x4_t _rows0n = vld1q_f32(rows0p+4);
-            float32x4_t _rows1n = vld1q_f32(rows1p+4);
+        int* xofs = buf;//new int[outw];
+        int* yofs = buf + outw;//new int[outh];
 
-            float32x4_t _Dn = vmulq_f32(_rows0n, _b0);
-            _Dn = vmlaq_f32(_Dn, _rows1n, _b1);
+        float* alpha = (float*)(buf + outw + outh);//new float[outw * 4];
+        float* beta = (float*)(buf + outw + outh + outw*4);//new float[outh * 4];
 
-            vst1q_f32(Dp+4, _Dn);
+        cubic_coeffs(w, outw, xofs, alpha);
+        cubic_coeffs(h, outh, yofs, beta);
 
-            Dp += 8;
-            rows0p += 8;
-            rows1p += 8;
-        }
-#endif // __ARM_NEON
-        for ( ; remain; --remain )
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
         {
-//             D[x] = rows0[x]*b0 + rows1[x]*b1;
-            *Dp++ = *rows0p++ * b0 + *rows1p++ * b1;
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bicubic_image(src, dst, alpha, xofs, beta, yofs);
         }
 
-        beta += 2;
+        delete[] buf;
     }
+
+    return 0;
 }
 
-int Interp_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+int Interp_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     int h = bottom_blob.h;
     int w = bottom_blob.w;
     int channels = bottom_blob.c;
     int dims = bottom_blob.dims;
     size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
 
-    if (resize_type != 2 || dims == 1)
+    if (dims == 1)
     {
         return Interp::forward(bottom_blob, top_blob, opt);
     }
@@ -247,31 +275,172 @@ int Interp_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt
         return 0;
     }
 
-    top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
+    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
-    int* buf = new int[outw + outh + outw*2 + outh*2];
+#if __ARM_NEON
+    if (elempack == 4)
+    {
+        if (resize_type == 1)// nearest
+        {
+            const float hs = output_height ? h / (float)output_height : 1.f / height_scale;
+            const float ws = output_width ? w / (float)output_width : 1.f / width_scale;
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                for (int y = 0; y < outh; y++)
+                {
+                    int in_y = std::min((int) (y * hs), (h - 1));
+
+                    const unsigned short* ptr = src.row<const unsigned short>(in_y);
+                    unsigned short* outptr = dst.row<unsigned short>(y);
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int in_x = std::min((int) (x * ws), (w - 1));
+
+                        uint16x4_t _p = vld1_u16(ptr + in_x * 4);
+                        vst1_u16(outptr, _p);
+
+                        outptr += 4;
+                    }
+                }
+            }
+        }
+
+        if (resize_type == 2)// bilinear
+        {
+            int* buf = new int[outw + outh + outw*2 + outh*2];
+
+            int* xofs = buf;//new int[outw];
+            int* yofs = buf + outw;//new int[outh];
+
+            float* alpha = (float*)(buf + outw + outh);//new float[outw * 2];
+            float* beta = (float*)(buf + outw + outh + outw*2);//new float[outh * 2];
+
+            linear_coeffs(w, outw, xofs, alpha);
+            linear_coeffs(h, outh, yofs, beta);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                resize_bilinear_image_pack4_bf16s(src, dst, alpha, xofs, beta, yofs);
+            }
+
+            delete[] buf;
+        }
+
+        if (resize_type == 3)// bicubic
+        {
+            int* buf = new int[outw + outh + outw*4 + outh*4];
+
+            int* xofs = buf;//new int[outw];
+            int* yofs = buf + outw;//new int[outh];
+
+            float* alpha = (float*)(buf + outw + outh);//new float[outw * 4];
+            float* beta = (float*)(buf + outw + outh + outw*4);//new float[outh * 4];
 
-    int* xofs = buf;//new int[outw];
-    int* yofs = buf + outw;//new int[outh];
+            cubic_coeffs(w, outw, xofs, alpha);
+            cubic_coeffs(h, outh, yofs, beta);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
 
-    float* alpha = (float*)(buf + outw + outh);//new float[outw * 2];
-    float* beta = (float*)(buf + outw + outh + outw*2);//new float[outh * 2];
+                resize_bicubic_image_pack4_bf16s(src, dst, alpha, xofs, beta, yofs);
+            }
 
-    linear_coeffs(w, outw, xofs, alpha);
-    linear_coeffs(h, outh, yofs, beta);
+            delete[] buf;
+        }
 
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int q = 0; q < channels; q++)
+        return 0;
+    }
+#endif // __ARM_NEON
+
+    if (resize_type == 1)// nearest
     {
-        const Mat src = bottom_blob.channel(q);
-        Mat dst = top_blob.channel(q);
+        const float hs = output_height ? h / (float)output_height : 1.f / height_scale;
+        const float ws = output_width ? w / (float)output_width : 1.f / width_scale;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
 
-        resize_bilinear_image(src, dst, alpha, xofs, beta, yofs);
+            for (int y = 0; y < outh; y++)
+            {
+                int in_y = std::min((int) (y * hs), (h - 1));
+
+                const unsigned short* ptr = src.row<const unsigned short>(in_y);
+                unsigned short* outptr = dst.row<unsigned short>(y);
+                for (int x = 0; x < outw; x++)
+                {
+                    int in_x = std::min((int) (x * ws), (w - 1));
+                    *outptr++ = ptr[in_x];
+                }
+            }
+        }
     }
 
-    delete[] buf;
+    if (resize_type == 2)// bilinear
+    {
+        int* buf = new int[outw + outh + outw*2 + outh*2];
+
+        int* xofs = buf;//new int[outw];
+        int* yofs = buf + outw;//new int[outh];
+
+        float* alpha = (float*)(buf + outw + outh);//new float[outw * 2];
+        float* beta = (float*)(buf + outw + outh + outw*2);//new float[outh * 2];
+
+        linear_coeffs(w, outw, xofs, alpha);
+        linear_coeffs(h, outh, yofs, beta);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bilinear_image_bf16s(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    if (resize_type == 3)// bicubic
+    {
+        int* buf = new int[outw + outh + outw*4 + outh*4];
+
+        int* xofs = buf;//new int[outw];
+        int* yofs = buf + outw;//new int[outh];
+
+        float* alpha = (float*)(buf + outw + outh);//new float[outw * 4];
+        float* beta = (float*)(buf + outw + outh + outw*4);//new float[outh * 4];
+
+        cubic_coeffs(w, outw, xofs, alpha);
+        cubic_coeffs(h, outh, yofs, beta);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bicubic_image_bf16s(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
 
     return 0;
 }
diff --git a/src/layer/arm/interp_arm.h b/src/layer/arm/interp_arm.h
index 3d858f6e3..f2c18b558 100644
--- a/src/layer/arm/interp_arm.h
+++ b/src/layer/arm/interp_arm.h
@@ -22,7 +22,12 @@ namespace ncnn {
 class Interp_arm : virtual public Interp
 {
 public:
+    Interp_arm();
+
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+protected:
+    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/interp_bicubic.h b/src/layer/arm/interp_bicubic.h
new file mode 100644
index 000000000..746103806
--- /dev/null
+++ b/src/layer/arm/interp_bicubic.h
@@ -0,0 +1,252 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static inline void interpolate_cubic(float fx, float* coeffs)
+{
+    const float A = -0.75f;
+
+    float fx0 = fx + 1;
+    float fx1 = fx;
+    float fx2 = 1 - fx;
+    // float fx3 = 2 - fx;
+
+    coeffs[0] = A * fx0*fx0*fx0 - 5*A * fx0*fx0 + 8*A * fx0 - 4*A;
+    coeffs[1] = (A+2) * fx1*fx1*fx1 - (A+3) * fx1*fx1 + 1;
+    coeffs[2] = (A+2) * fx2*fx2*fx2 - (A+3) * fx2*fx2 + 1;
+    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
+}
+
+static void cubic_coeffs(int w, int outw, int* xofs, float* alpha)
+{
+    double scale = (double)w / outw;
+
+    for (int dx = 0; dx < outw; dx++)
+    {
+        float fx = (float)((dx + 0.5) * scale - 0.5);
+        int sx = static_cast<int>(floor(fx));
+        fx -= sx;
+
+        interpolate_cubic(fx, alpha + dx*4);
+
+        if (sx <= -1)
+        {
+            sx = 1;
+            alpha[dx*4 +0] = 1.f - alpha[dx*4 +3];
+            alpha[dx*4 +1] = alpha[dx*4 +3];
+            alpha[dx*4 +2] = 0.f;
+            alpha[dx*4 +3] = 0.f;
+        }
+        if (sx == 0)
+        {
+            sx = 1;
+            alpha[dx*4 +0] = alpha[dx*4 +0] + alpha[dx*4 +1];
+            alpha[dx*4 +1] = alpha[dx*4 +2];
+            alpha[dx*4 +2] = alpha[dx*4 +3];
+            alpha[dx*4 +3] = 0.f;
+        }
+        if (sx == w - 2)
+        {
+            sx = w - 3;
+            alpha[dx*4 +3] = alpha[dx*4 +2] + alpha[dx*4 +3];
+            alpha[dx*4 +2] = alpha[dx*4 +1];
+            alpha[dx*4 +1] = alpha[dx*4 +0];
+            alpha[dx*4 +0] = 0.f;
+        }
+        if (sx >= w - 1)
+        {
+            sx = w - 3;
+            alpha[dx*4 +3] = 1.f - alpha[dx*4 +0];
+            alpha[dx*4 +2] = alpha[dx*4 +0];
+            alpha[dx*4 +1] = 0.f;
+            alpha[dx*4 +0] = 0.f;
+        }
+
+        xofs[dx] = sx;
+    }
+}
+
+static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
+{
+    int w = dst.w;
+    int h = dst.h;
+
+    // loop body
+    Mat rowsbuf0(w);
+    Mat rowsbuf1(w);
+    Mat rowsbuf2(w);
+    Mat rowsbuf3(w);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+    float* rows2 = rowsbuf2;
+    float* rows3 = rowsbuf3;
+
+    int prev_sy1 = -3;
+
+    for (int dy = 0; dy < h; dy++ )
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // reuse all rows
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // hresize one row
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows2;
+            rows2 = rows3;
+            rows3 = rows0_old;
+            const float* S3 = src.row(sy+2);
+
+            const float* alphap = alpha;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const float* S3p = S3 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                float a2 = alphap[2];
+                float a3 = alphap[3];
+                rows3p[dx] = S3p[-1]*a0 + S3p[0]*a1 + S3p[1]*a2 + S3p[2]*a3;
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 2)
+        {
+            // hresize two rows
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            rows0 = rows2;
+            rows1 = rows3;
+            rows2 = rows0_old;
+            rows3 = rows1_old;
+            const float* S2 = src.row(sy+1);
+            const float* S3 = src.row(sy+2);
+
+            const float* alphap = alpha;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                float a2 = alphap[2];
+                float a3 = alphap[3];
+                rows2p[dx] = S2p[-1]*a0 + S2p[0]*a1 + S2p[1]*a2 + S2p[2]*a3;
+                rows3p[dx] = S3p[-1]*a0 + S3p[0]*a1 + S3p[1]*a2 + S3p[2]*a3;
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 3)
+        {
+            // hresize three rows
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            float* rows2_old = rows2;
+            rows0 = rows3;
+            rows1 = rows0_old;
+            rows2 = rows1_old;
+            rows3 = rows2_old;
+            const float* S1 = src.row(sy);
+            const float* S2 = src.row(sy+1);
+            const float* S3 = src.row(sy+2);
+
+            const float* alphap = alpha;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const float* S1p = S1 + sx;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                float a2 = alphap[2];
+                float a3 = alphap[3];
+                rows1p[dx] = S1p[-1]*a0 + S1p[0]*a1 + S1p[1]*a2 + S1p[2]*a3;
+                rows2p[dx] = S2p[-1]*a0 + S2p[0]*a1 + S2p[1]*a2 + S2p[2]*a3;
+                rows3p[dx] = S3p[-1]*a0 + S3p[0]*a1 + S3p[1]*a2 + S3p[2]*a3;
+
+                alphap += 4;
+            }
+        }
+        else
+        {
+            // hresize four rows
+            const float* S0 = src.row(sy-1);
+            const float* S1 = src.row(sy);
+            const float* S2 = src.row(sy+1);
+            const float* S3 = src.row(sy+2);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const float* S0p = S0 + sx;
+                const float* S1p = S1 + sx;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                float a2 = alphap[2];
+                float a3 = alphap[3];
+                rows0p[dx] = S0p[-1]*a0 + S0p[0]*a1 + S0p[1]*a2 + S0p[2]*a3;
+                rows1p[dx] = S1p[-1]*a0 + S1p[0]*a1 + S1p[1]*a2 + S1p[2]*a3;
+                rows2p[dx] = S2p[-1]*a0 + S2p[0]*a1 + S2p[1]*a2 + S2p[2]*a3;
+                rows3p[dx] = S3p[-1]*a0 + S3p[0]*a1 + S3p[1]*a2 + S3p[2]*a3;
+
+                alphap += 4;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vresize
+        float b0 = beta[0];
+        float b1 = beta[1];
+        float b2 = beta[2];
+        float b3 = beta[3];
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        float* rows2p = rows2;
+        float* rows3p = rows3;
+        float* Dp = dst.row(dy);
+        for (int dx = 0; dx < w; dx++)
+        {
+//             D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3;
+            *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3;
+        }
+
+        beta += 4;
+    }
+}
diff --git a/src/layer/arm/interp_bicubic_bf16s.h b/src/layer/arm/interp_bicubic_bf16s.h
new file mode 100644
index 000000000..ed98e4204
--- /dev/null
+++ b/src/layer/arm/interp_bicubic_bf16s.h
@@ -0,0 +1,188 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void resize_bicubic_image_bf16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
+{
+    int w = dst.w;
+    int h = dst.h;
+
+    // loop body
+    Mat rowsbuf0(w);
+    Mat rowsbuf1(w);
+    Mat rowsbuf2(w);
+    Mat rowsbuf3(w);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+    float* rows2 = rowsbuf2;
+    float* rows3 = rowsbuf3;
+
+    int prev_sy1 = -3;
+
+    for (int dy = 0; dy < h; dy++ )
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // reuse all rows
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // hresize one row
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows2;
+            rows2 = rows3;
+            rows3 = rows0_old;
+            const unsigned short* S3 = src.row<const unsigned short>(sy+2);
+
+            const float* alphap = alpha;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const unsigned short* S3p = S3 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                float a2 = alphap[2];
+                float a3 = alphap[3];
+                rows3p[dx] = bfloat16_to_float32(S3p[-1])*a0 + bfloat16_to_float32(S3p[0])*a1 + bfloat16_to_float32(S3p[1])*a2 + bfloat16_to_float32(S3p[2])*a3;
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 2)
+        {
+            // hresize two rows
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            rows0 = rows2;
+            rows1 = rows3;
+            rows2 = rows0_old;
+            rows3 = rows1_old;
+            const unsigned short* S2 = src.row<const unsigned short>(sy+1);
+            const unsigned short* S3 = src.row<const unsigned short>(sy+2);
+
+            const float* alphap = alpha;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const unsigned short* S2p = S2 + sx;
+                const unsigned short* S3p = S3 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                float a2 = alphap[2];
+                float a3 = alphap[3];
+                rows2p[dx] = bfloat16_to_float32(S2p[-1])*a0 + bfloat16_to_float32(S2p[0])*a1 + bfloat16_to_float32(S2p[1])*a2 + bfloat16_to_float32(S2p[2])*a3;
+                rows3p[dx] = bfloat16_to_float32(S3p[-1])*a0 + bfloat16_to_float32(S3p[0])*a1 + bfloat16_to_float32(S3p[1])*a2 + bfloat16_to_float32(S3p[2])*a3;
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 3)
+        {
+            // hresize three rows
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            float* rows2_old = rows2;
+            rows0 = rows3;
+            rows1 = rows0_old;
+            rows2 = rows1_old;
+            rows3 = rows2_old;
+            const unsigned short* S1 = src.row<const unsigned short>(sy);
+            const unsigned short* S2 = src.row<const unsigned short>(sy+1);
+            const unsigned short* S3 = src.row<const unsigned short>(sy+2);
+
+            const float* alphap = alpha;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const unsigned short* S1p = S1 + sx;
+                const unsigned short* S2p = S2 + sx;
+                const unsigned short* S3p = S3 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                float a2 = alphap[2];
+                float a3 = alphap[3];
+                rows1p[dx] = bfloat16_to_float32(S1p[-1])*a0 + bfloat16_to_float32(S1p[0])*a1 + bfloat16_to_float32(S1p[1])*a2 + bfloat16_to_float32(S1p[2])*a3;
+                rows2p[dx] = bfloat16_to_float32(S2p[-1])*a0 + bfloat16_to_float32(S2p[0])*a1 + bfloat16_to_float32(S2p[1])*a2 + bfloat16_to_float32(S2p[2])*a3;
+                rows3p[dx] = bfloat16_to_float32(S3p[-1])*a0 + bfloat16_to_float32(S3p[0])*a1 + bfloat16_to_float32(S3p[1])*a2 + bfloat16_to_float32(S3p[2])*a3;
+
+                alphap += 4;
+            }
+        }
+        else
+        {
+            // hresize four rows
+            const unsigned short* S0 = src.row<const unsigned short>(sy-1);
+            const unsigned short* S1 = src.row<const unsigned short>(sy);
+            const unsigned short* S2 = src.row<const unsigned short>(sy+1);
+            const unsigned short* S3 = src.row<const unsigned short>(sy+2);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const unsigned short* S0p = S0 + sx;
+                const unsigned short* S1p = S1 + sx;
+                const unsigned short* S2p = S2 + sx;
+                const unsigned short* S3p = S3 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                float a2 = alphap[2];
+                float a3 = alphap[3];
+                rows0p[dx] = bfloat16_to_float32(S0p[-1])*a0 + bfloat16_to_float32(S0p[0])*a1 + bfloat16_to_float32(S0p[1])*a2 + bfloat16_to_float32(S0p[2])*a3;
+                rows1p[dx] = bfloat16_to_float32(S1p[-1])*a0 + bfloat16_to_float32(S1p[0])*a1 + bfloat16_to_float32(S1p[1])*a2 + bfloat16_to_float32(S1p[2])*a3;
+                rows2p[dx] = bfloat16_to_float32(S2p[-1])*a0 + bfloat16_to_float32(S2p[0])*a1 + bfloat16_to_float32(S2p[1])*a2 + bfloat16_to_float32(S2p[2])*a3;
+                rows3p[dx] = bfloat16_to_float32(S3p[-1])*a0 + bfloat16_to_float32(S3p[0])*a1 + bfloat16_to_float32(S3p[1])*a2 + bfloat16_to_float32(S3p[2])*a3;
+
+                alphap += 4;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vresize
+        float b0 = beta[0];
+        float b1 = beta[1];
+        float b2 = beta[2];
+        float b3 = beta[3];
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        float* rows2p = rows2;
+        float* rows3p = rows3;
+        unsigned short* Dp = dst.row<unsigned short>(dy);
+        for (int dx = 0; dx < w; dx++)
+        {
+//             D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3;
+            *Dp++ = float32_to_bfloat16(*rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3);
+        }
+
+        beta += 4;
+    }
+}
diff --git a/src/layer/arm/interp_bicubic_pack4.h b/src/layer/arm/interp_bicubic_pack4.h
new file mode 100644
index 000000000..3ac71b46f
--- /dev/null
+++ b/src/layer/arm/interp_bicubic_pack4.h
@@ -0,0 +1,272 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
+{
+    int w = dst.w;
+    int h = dst.h;
+
+    // loop body
+    Mat rowsbuf0(w, (size_t)4 * 4u, 4);
+    Mat rowsbuf1(w, (size_t)4 * 4u, 4);
+    Mat rowsbuf2(w, (size_t)4 * 4u, 4);
+    Mat rowsbuf3(w, (size_t)4 * 4u, 4);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+    float* rows2 = rowsbuf2;
+    float* rows3 = rowsbuf3;
+
+    int prev_sy1 = -3;
+
+    for (int dy = 0; dy < h; dy++ )
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // reuse all rows
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // hresize one row
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows2;
+            rows2 = rows3;
+            rows3 = rows0_old;
+            const float* S3 = src.row(sy+2);
+
+            const float* alphap = alpha;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4;
+                const float* S3p = S3 + sx;
+
+                float32x4_t _a0123 = vld1q_f32(alphap);
+
+                float32x4_t _S30 = vld1q_f32(S3p - 4);
+                float32x4_t _S31 = vld1q_f32(S3p + 0);
+                float32x4_t _S32 = vld1q_f32(S3p + 4);
+                float32x4_t _S33 = vld1q_f32(S3p + 8);
+                float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0);
+                _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0);
+                _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1);
+                vst1q_f32(rows3p + dx * 4, _rows3);
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 2)
+        {
+            // hresize two rows
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            rows0 = rows2;
+            rows1 = rows3;
+            rows2 = rows0_old;
+            rows3 = rows1_old;
+            const float* S2 = src.row(sy+1);
+            const float* S3 = src.row(sy+2);
+
+            const float* alphap = alpha;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                float32x4_t _a0123 = vld1q_f32(alphap);
+
+                float32x4_t _S20 = vld1q_f32(S2p - 4);
+                float32x4_t _S21 = vld1q_f32(S2p + 0);
+                float32x4_t _S22 = vld1q_f32(S2p + 4);
+                float32x4_t _S23 = vld1q_f32(S2p + 8);
+                float32x4_t _S30 = vld1q_f32(S3p - 4);
+                float32x4_t _S31 = vld1q_f32(S3p + 0);
+                float32x4_t _S32 = vld1q_f32(S3p + 4);
+                float32x4_t _S33 = vld1q_f32(S3p + 8);
+                float32x4_t _rows2 = vmulq_lane_f32(_S20, vget_low_f32(_a0123), 0);
+                float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0);
+                _rows2 = vmlaq_lane_f32(_rows2, _S21, vget_low_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1);
+                _rows2 = vmlaq_lane_f32(_rows2, _S22, vget_high_f32(_a0123), 0);
+                _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0);
+                _rows2 = vmlaq_lane_f32(_rows2, _S23, vget_high_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1);
+                vst1q_f32(rows2p + dx * 4, _rows2);
+                vst1q_f32(rows3p + dx * 4, _rows3);
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 3)
+        {
+            // hresize three rows
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            float* rows2_old = rows2;
+            rows0 = rows3;
+            rows1 = rows0_old;
+            rows2 = rows1_old;
+            rows3 = rows2_old;
+            const float* S1 = src.row(sy);
+            const float* S2 = src.row(sy+1);
+            const float* S3 = src.row(sy+2);
+
+            const float* alphap = alpha;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4;
+                const float* S1p = S1 + sx;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                float32x4_t _a0123 = vld1q_f32(alphap);
+
+                float32x4_t _S10 = vld1q_f32(S1p - 4);
+                float32x4_t _S11 = vld1q_f32(S1p + 0);
+                float32x4_t _S12 = vld1q_f32(S1p + 4);
+                float32x4_t _S13 = vld1q_f32(S1p + 8);
+                float32x4_t _S20 = vld1q_f32(S2p - 4);
+                float32x4_t _S21 = vld1q_f32(S2p + 0);
+                float32x4_t _S22 = vld1q_f32(S2p + 4);
+                float32x4_t _S23 = vld1q_f32(S2p + 8);
+                float32x4_t _S30 = vld1q_f32(S3p - 4);
+                float32x4_t _S31 = vld1q_f32(S3p + 0);
+                float32x4_t _S32 = vld1q_f32(S3p + 4);
+                float32x4_t _S33 = vld1q_f32(S3p + 8);
+                float32x4_t _rows1 = vmulq_lane_f32(_S10, vget_low_f32(_a0123), 0);
+                float32x4_t _rows2 = vmulq_lane_f32(_S20, vget_low_f32(_a0123), 0);
+                float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0);
+                _rows1 = vmlaq_lane_f32(_rows1, _S11, vget_low_f32(_a0123), 1);
+                _rows2 = vmlaq_lane_f32(_rows2, _S21, vget_low_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1);
+                _rows1 = vmlaq_lane_f32(_rows1, _S12, vget_high_f32(_a0123), 0);
+                _rows2 = vmlaq_lane_f32(_rows2, _S22, vget_high_f32(_a0123), 0);
+                _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0);
+                _rows1 = vmlaq_lane_f32(_rows1, _S13, vget_high_f32(_a0123), 1);
+                _rows2 = vmlaq_lane_f32(_rows2, _S23, vget_high_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1);
+                vst1q_f32(rows1p + dx * 4, _rows1);
+                vst1q_f32(rows2p + dx * 4, _rows2);
+                vst1q_f32(rows3p + dx * 4, _rows3);
+
+                alphap += 4;
+            }
+        }
+        else
+        {
+            // hresize four rows
+            const float* S0 = src.row(sy-1);
+            const float* S1 = src.row(sy);
+            const float* S2 = src.row(sy+1);
+            const float* S3 = src.row(sy+2);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4;
+                const float* S0p = S0 + sx;
+                const float* S1p = S1 + sx;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                float32x4_t _a0123 = vld1q_f32(alphap);
+
+                // TODO check the generated assembly on armv7
+                float32x4_t _S00 = vld1q_f32(S0p - 4);
+                float32x4_t _S01 = vld1q_f32(S0p + 0);
+                float32x4_t _S02 = vld1q_f32(S0p + 4);
+                float32x4_t _S03 = vld1q_f32(S0p + 8);
+                float32x4_t _S10 = vld1q_f32(S1p - 4);
+                float32x4_t _S11 = vld1q_f32(S1p + 0);
+                float32x4_t _S12 = vld1q_f32(S1p + 4);
+                float32x4_t _S13 = vld1q_f32(S1p + 8);
+                float32x4_t _S20 = vld1q_f32(S2p - 4);
+                float32x4_t _S21 = vld1q_f32(S2p + 0);
+                float32x4_t _S22 = vld1q_f32(S2p + 4);
+                float32x4_t _S23 = vld1q_f32(S2p + 8);
+                float32x4_t _S30 = vld1q_f32(S3p - 4);
+                float32x4_t _S31 = vld1q_f32(S3p + 0);
+                float32x4_t _S32 = vld1q_f32(S3p + 4);
+                float32x4_t _S33 = vld1q_f32(S3p + 8);
+                float32x4_t _rows0 = vmulq_lane_f32(_S00, vget_low_f32(_a0123), 0);
+                float32x4_t _rows1 = vmulq_lane_f32(_S10, vget_low_f32(_a0123), 0);
+                float32x4_t _rows2 = vmulq_lane_f32(_S20, vget_low_f32(_a0123), 0);
+                float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0);
+                _rows0 = vmlaq_lane_f32(_rows0, _S01, vget_low_f32(_a0123), 1);
+                _rows1 = vmlaq_lane_f32(_rows1, _S11, vget_low_f32(_a0123), 1);
+                _rows2 = vmlaq_lane_f32(_rows2, _S21, vget_low_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1);
+                _rows0 = vmlaq_lane_f32(_rows0, _S02, vget_high_f32(_a0123), 0);
+                _rows1 = vmlaq_lane_f32(_rows1, _S12, vget_high_f32(_a0123), 0);
+                _rows2 = vmlaq_lane_f32(_rows2, _S22, vget_high_f32(_a0123), 0);
+                _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0);
+                _rows0 = vmlaq_lane_f32(_rows0, _S03, vget_high_f32(_a0123), 1);
+                _rows1 = vmlaq_lane_f32(_rows1, _S13, vget_high_f32(_a0123), 1);
+                _rows2 = vmlaq_lane_f32(_rows2, _S23, vget_high_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1);
+                vst1q_f32(rows0p + dx * 4, _rows0);
+                vst1q_f32(rows1p + dx * 4, _rows1);
+                vst1q_f32(rows2p + dx * 4, _rows2);
+                vst1q_f32(rows3p + dx * 4, _rows3);
+
+                alphap += 4;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vresize
+        float32x4_t _b0123 = vld1q_f32(beta);
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        float* rows2p = rows2;
+        float* rows3p = rows3;
+        float* Dp = dst.row(dy);
+
+        for (int dx = 0; dx < w; dx++)
+        {
+            float32x4_t _rows0 = vld1q_f32(rows0p);
+            float32x4_t _rows1 = vld1q_f32(rows1p);
+            float32x4_t _rows2 = vld1q_f32(rows2p);
+            float32x4_t _rows3 = vld1q_f32(rows3p);
+            float32x4_t _D = vmulq_lane_f32(_rows0, vget_low_f32(_b0123), 0);
+            _D = vmlaq_lane_f32(_D, _rows1, vget_low_f32(_b0123), 1);
+            _D = vmlaq_lane_f32(_D, _rows2, vget_high_f32(_b0123), 0);
+            _D = vmlaq_lane_f32(_D, _rows3, vget_high_f32(_b0123), 1);
+            vst1q_f32(Dp, _D);
+
+            Dp += 4;
+            rows0p += 4;
+            rows1p += 4;
+            rows2p += 4;
+            rows3p += 4;
+        }
+
+        beta += 4;
+    }
+}
diff --git a/src/layer/arm/interp_bicubic_pack4_bf16s.h b/src/layer/arm/interp_bicubic_pack4_bf16s.h
new file mode 100644
index 000000000..df5d88d79
--- /dev/null
+++ b/src/layer/arm/interp_bicubic_pack4_bf16s.h
@@ -0,0 +1,272 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void resize_bicubic_image_pack4_bf16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
+{
+    int w = dst.w;
+    int h = dst.h;
+
+    // loop body
+    Mat rowsbuf0(w, (size_t)4 * 4u, 4);
+    Mat rowsbuf1(w, (size_t)4 * 4u, 4);
+    Mat rowsbuf2(w, (size_t)4 * 4u, 4);
+    Mat rowsbuf3(w, (size_t)4 * 4u, 4);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+    float* rows2 = rowsbuf2;
+    float* rows3 = rowsbuf3;
+
+    int prev_sy1 = -3;
+
+    for (int dy = 0; dy < h; dy++ )
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // reuse all rows
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // hresize one row
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows2;
+            rows2 = rows3;
+            rows3 = rows0_old;
+            const unsigned short* S3 = src.row<const unsigned short>(sy+2);
+
+            const float* alphap = alpha;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4;
+                const unsigned short* S3p = S3 + sx;
+
+                float32x4_t _a0123 = vld1q_f32(alphap);
+
+                float32x4_t _S30 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p - 4), 16));
+                float32x4_t _S31 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 0), 16));
+                float32x4_t _S32 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 4), 16));
+                float32x4_t _S33 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 8), 16));
+                float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0);
+                _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0);
+                _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1);
+                vst1q_f32(rows3p + dx * 4, _rows3);
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 2)
+        {
+            // hresize two rows
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            rows0 = rows2;
+            rows1 = rows3;
+            rows2 = rows0_old;
+            rows3 = rows1_old;
+            const unsigned short* S2 = src.row<const unsigned short>(sy+1);
+            const unsigned short* S3 = src.row<const unsigned short>(sy+2);
+
+            const float* alphap = alpha;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4;
+                const unsigned short* S2p = S2 + sx;
+                const unsigned short* S3p = S3 + sx;
+
+                float32x4_t _a0123 = vld1q_f32(alphap);
+
+                float32x4_t _S20 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p - 4), 16));
+                float32x4_t _S21 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 0), 16));
+                float32x4_t _S22 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 4), 16));
+                float32x4_t _S23 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 8), 16));
+                float32x4_t _S30 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p - 4), 16));
+                float32x4_t _S31 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 0), 16));
+                float32x4_t _S32 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 4), 16));
+                float32x4_t _S33 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 8), 16));
+                float32x4_t _rows2 = vmulq_lane_f32(_S20, vget_low_f32(_a0123), 0);
+                float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0);
+                _rows2 = vmlaq_lane_f32(_rows2, _S21, vget_low_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1);
+                _rows2 = vmlaq_lane_f32(_rows2, _S22, vget_high_f32(_a0123), 0);
+                _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0);
+                _rows2 = vmlaq_lane_f32(_rows2, _S23, vget_high_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1);
+                vst1q_f32(rows2p + dx * 4, _rows2);
+                vst1q_f32(rows3p + dx * 4, _rows3);
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 3)
+        {
+            // hresize three rows
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            float* rows2_old = rows2;
+            rows0 = rows3;
+            rows1 = rows0_old;
+            rows2 = rows1_old;
+            rows3 = rows2_old;
+            const unsigned short* S1 = src.row<const unsigned short>(sy);
+            const unsigned short* S2 = src.row<const unsigned short>(sy+1);
+            const unsigned short* S3 = src.row<const unsigned short>(sy+2);
+
+            const float* alphap = alpha;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4;
+                const unsigned short* S1p = S1 + sx;
+                const unsigned short* S2p = S2 + sx;
+                const unsigned short* S3p = S3 + sx;
+
+                float32x4_t _a0123 = vld1q_f32(alphap);
+
+                float32x4_t _S10 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p - 4), 16));
+                float32x4_t _S11 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 0), 16));
+                float32x4_t _S12 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 4), 16));
+                float32x4_t _S13 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 8), 16));
+                float32x4_t _S20 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p - 4), 16));
+                float32x4_t _S21 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 0), 16));
+                float32x4_t _S22 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 4), 16));
+                float32x4_t _S23 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 8), 16));
+                float32x4_t _S30 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p - 4), 16));
+                float32x4_t _S31 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 0), 16));
+                float32x4_t _S32 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 4), 16));
+                float32x4_t _S33 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 8), 16));
+                float32x4_t _rows1 = vmulq_lane_f32(_S10, vget_low_f32(_a0123), 0);
+                float32x4_t _rows2 = vmulq_lane_f32(_S20, vget_low_f32(_a0123), 0);
+                float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0);
+                _rows1 = vmlaq_lane_f32(_rows1, _S11, vget_low_f32(_a0123), 1);
+                _rows2 = vmlaq_lane_f32(_rows2, _S21, vget_low_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1);
+                _rows1 = vmlaq_lane_f32(_rows1, _S12, vget_high_f32(_a0123), 0);
+                _rows2 = vmlaq_lane_f32(_rows2, _S22, vget_high_f32(_a0123), 0);
+                _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0);
+                _rows1 = vmlaq_lane_f32(_rows1, _S13, vget_high_f32(_a0123), 1);
+                _rows2 = vmlaq_lane_f32(_rows2, _S23, vget_high_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1);
+                vst1q_f32(rows1p + dx * 4, _rows1);
+                vst1q_f32(rows2p + dx * 4, _rows2);
+                vst1q_f32(rows3p + dx * 4, _rows3);
+
+                alphap += 4;
+            }
+        }
+        else
+        {
+            // hresize four rows
+            const unsigned short* S0 = src.row<const unsigned short>(sy-1);
+            const unsigned short* S1 = src.row<const unsigned short>(sy);
+            const unsigned short* S2 = src.row<const unsigned short>(sy+1);
+            const unsigned short* S3 = src.row<const unsigned short>(sy+2);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4;
+                const unsigned short* S0p = S0 + sx;
+                const unsigned short* S1p = S1 + sx;
+                const unsigned short* S2p = S2 + sx;
+                const unsigned short* S3p = S3 + sx;
+
+                float32x4_t _a0123 = vld1q_f32(alphap);
+
+                // TODO check the generated assembly on armv7
+                float32x4_t _S00 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S0p - 4), 16));
+                float32x4_t _S01 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S0p + 0), 16));
+                float32x4_t _S02 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S0p + 4), 16));
+                float32x4_t _S03 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S0p + 8), 16));
+                float32x4_t _S10 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p - 4), 16));
+                float32x4_t _S11 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 0), 16));
+                float32x4_t _S12 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 4), 16));
+                float32x4_t _S13 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 8), 16));
+                float32x4_t _S20 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p - 4), 16));
+                float32x4_t _S21 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 0), 16));
+                float32x4_t _S22 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 4), 16));
+                float32x4_t _S23 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S2p + 8), 16));
+                float32x4_t _S30 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p - 4), 16));
+                float32x4_t _S31 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 0), 16));
+                float32x4_t _S32 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 4), 16));
+                float32x4_t _S33 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S3p + 8), 16));
+                float32x4_t _rows0 = vmulq_lane_f32(_S00, vget_low_f32(_a0123), 0);
+                float32x4_t _rows1 = vmulq_lane_f32(_S10, vget_low_f32(_a0123), 0);
+                float32x4_t _rows2 = vmulq_lane_f32(_S20, vget_low_f32(_a0123), 0);
+                float32x4_t _rows3 = vmulq_lane_f32(_S30, vget_low_f32(_a0123), 0);
+                _rows0 = vmlaq_lane_f32(_rows0, _S01, vget_low_f32(_a0123), 1);
+                _rows1 = vmlaq_lane_f32(_rows1, _S11, vget_low_f32(_a0123), 1);
+                _rows2 = vmlaq_lane_f32(_rows2, _S21, vget_low_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S31, vget_low_f32(_a0123), 1);
+                _rows0 = vmlaq_lane_f32(_rows0, _S02, vget_high_f32(_a0123), 0);
+                _rows1 = vmlaq_lane_f32(_rows1, _S12, vget_high_f32(_a0123), 0);
+                _rows2 = vmlaq_lane_f32(_rows2, _S22, vget_high_f32(_a0123), 0);
+                _rows3 = vmlaq_lane_f32(_rows3, _S32, vget_high_f32(_a0123), 0);
+                _rows0 = vmlaq_lane_f32(_rows0, _S03, vget_high_f32(_a0123), 1);
+                _rows1 = vmlaq_lane_f32(_rows1, _S13, vget_high_f32(_a0123), 1);
+                _rows2 = vmlaq_lane_f32(_rows2, _S23, vget_high_f32(_a0123), 1);
+                _rows3 = vmlaq_lane_f32(_rows3, _S33, vget_high_f32(_a0123), 1);
+                vst1q_f32(rows0p + dx * 4, _rows0);
+                vst1q_f32(rows1p + dx * 4, _rows1);
+                vst1q_f32(rows2p + dx * 4, _rows2);
+                vst1q_f32(rows3p + dx * 4, _rows3);
+
+                alphap += 4;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vresize
+        float32x4_t _b0123 = vld1q_f32(beta);
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        float* rows2p = rows2;
+        float* rows3p = rows3;
+        unsigned short* Dp = dst.row<unsigned short>(dy);
+
+        for (int dx = 0; dx < w; dx++)
+        {
+            float32x4_t _rows0 = vld1q_f32(rows0p);
+            float32x4_t _rows1 = vld1q_f32(rows1p);
+            float32x4_t _rows2 = vld1q_f32(rows2p);
+            float32x4_t _rows3 = vld1q_f32(rows3p);
+            float32x4_t _D = vmulq_lane_f32(_rows0, vget_low_f32(_b0123), 0);
+            _D = vmlaq_lane_f32(_D, _rows1, vget_low_f32(_b0123), 1);
+            _D = vmlaq_lane_f32(_D, _rows2, vget_high_f32(_b0123), 0);
+            _D = vmlaq_lane_f32(_D, _rows3, vget_high_f32(_b0123), 1);
+            vst1_u16(Dp, vshrn_n_u32(vreinterpretq_u32_f32(_D), 16));
+
+            Dp += 4;
+            rows0p += 4;
+            rows1p += 4;
+            rows2p += 4;
+            rows3p += 4;
+        }
+
+        beta += 4;
+    }
+}
diff --git a/src/layer/arm/interp_bilinear.h b/src/layer/arm/interp_bilinear.h
new file mode 100644
index 000000000..44839841e
--- /dev/null
+++ b/src/layer/arm/interp_bilinear.h
@@ -0,0 +1,213 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void linear_coeffs(int w, int outw, int* xofs, float* alpha)
+{
+    double scale = (double)w / outw;
+
+    for (int dx = 0; dx < outw; dx++)
+    {
+        float fx = (float)((dx + 0.5) * scale - 0.5);
+        int sx = floor(fx);
+        fx -= sx;
+
+        if (sx < 0)
+        {
+            sx = 0;
+            fx = 0.f;
+        }
+        if (sx >= w - 1)
+        {
+            sx = w - 2;
+            fx = 1.f;
+        }
+
+        xofs[dx] = sx;
+
+        alpha[dx*2    ] = 1.f - fx;
+        alpha[dx*2 + 1] = fx;
+    }
+}
+
+static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
+{
+    int w = dst.w;
+    int h = dst.h;
+
+    // loop body
+    Mat rowsbuf0(w);
+    Mat rowsbuf1(w);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+
+    int prev_sy1 = -2;
+
+    for (int dy = 0; dy < h; dy++ )
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // reuse all rows
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // hresize one row
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows0_old;
+            const float* S1 = src.row(sy+1);
+
+            const float* alphap = alpha;
+            float* rows1p = rows1;
+            int dx = 0;
+#if __ARM_NEON
+            for ( ; dx+1 < w; dx += 2 )
+            {
+                int sx = xofs[dx];
+                int sxn = xofs[dx+1];
+                const float* S1p = S1 + sx;
+                const float* S1np = S1 + sxn;
+
+                float32x4_t _a = vld1q_f32(alphap);
+                float32x2_t _S1 = vld1_f32(S1p);
+                float32x2_t _S1n = vld1_f32(S1np);
+
+                float32x4_t _S1S1n = vcombine_f32(_S1, _S1n);
+                float32x4_t _ms1 = vmulq_f32(_S1S1n, _a);
+                float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1));
+
+                vst1_f32(rows1p + dx, _rows1);
+
+                alphap += 4;
+            }
+#endif // __ARM_NEON
+            for ( ; dx < w; dx++ )
+            {
+                int sx = xofs[dx];
+                const float* S1p = S1 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                rows1p[dx] = S1p[0]*a0 + S1p[1]*a1;
+
+                alphap += 2;
+            }
+        }
+        else
+        {
+            // hresize two rows
+            const float* S0 = src.row(sy);
+            const float* S1 = src.row(sy+1);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            int dx = 0;
+#if __ARM_NEON
+            for ( ; dx+1 < w; dx += 2 )
+            {
+                int sx = xofs[dx];
+                int sxn = xofs[dx+1];
+                const float* S0p = S0 + sx;
+                const float* S1p = S1 + sx;
+                const float* S0np = S0 + sxn;
+                const float* S1np = S1 + sxn;
+
+                float32x4_t _a = vld1q_f32(alphap);
+                float32x2_t _S0 = vld1_f32(S0p);
+                float32x2_t _S1 = vld1_f32(S1p);
+                float32x2_t _S0n = vld1_f32(S0np);
+                float32x2_t _S1n = vld1_f32(S1np);
+
+                float32x4_t _S0S0n = vcombine_f32(_S0, _S0n);
+                float32x4_t _S1S1n = vcombine_f32(_S1, _S1n);
+                float32x4_t _ms0 = vmulq_f32(_S0S0n, _a);
+                float32x4_t _ms1 = vmulq_f32(_S1S1n, _a);
+                float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0));
+                float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1));
+
+                vst1_f32(rows0p + dx, _rows0);
+                vst1_f32(rows1p + dx, _rows1);
+
+                alphap += 4;
+            }
+#endif // __ARM_NEON
+            for ( ; dx < w; dx++ )
+            {
+                int sx = xofs[dx];
+                const float* S0p = S0 + sx;
+                const float* S1p = S1 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                rows0p[dx] = S0p[0]*a0 + S0p[1]*a1;
+                rows1p[dx] = S1p[0]*a0 + S1p[1]*a1;
+
+                alphap += 2;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vresize
+        float b0 = beta[0];
+        float b1 = beta[1];
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        float* Dp = dst.row(dy);
+
+#if __ARM_NEON
+        int nn = w >> 3;
+#else
+        int nn = 0;
+#endif
+        int remain = w - (nn << 3);
+
+#if __ARM_NEON
+        float32x4_t _b0 = vdupq_n_f32(b0);
+        float32x4_t _b1 = vdupq_n_f32(b1);
+        for (; nn>0; nn--)
+        {
+            float32x4_t _rows0 = vld1q_f32(rows0p);
+            float32x4_t _rows1 = vld1q_f32(rows1p);
+
+            float32x4_t _D = vmulq_f32(_rows0, _b0);
+            _D = vmlaq_f32(_D, _rows1, _b1);
+
+            vst1q_f32(Dp, _D);
+
+            float32x4_t _rows0n = vld1q_f32(rows0p+4);
+            float32x4_t _rows1n = vld1q_f32(rows1p+4);
+
+            float32x4_t _Dn = vmulq_f32(_rows0n, _b0);
+            _Dn = vmlaq_f32(_Dn, _rows1n, _b1);
+
+            vst1q_f32(Dp+4, _Dn);
+
+            Dp += 8;
+            rows0p += 8;
+            rows1p += 8;
+        }
+#endif // __ARM_NEON
+        for ( ; remain; --remain )
+        {
+//             D[x] = rows0[x]*b0 + rows1[x]*b1;
+            *Dp++ = *rows0p++ * b0 + *rows1p++ * b1;
+        }
+
+        beta += 2;
+    }
+}
diff --git a/src/layer/arm/interp_bilinear_bf16s.h b/src/layer/arm/interp_bilinear_bf16s.h
new file mode 100644
index 000000000..b4a0c992f
--- /dev/null
+++ b/src/layer/arm/interp_bilinear_bf16s.h
@@ -0,0 +1,135 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void resize_bilinear_image_bf16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
+{
+    int w = dst.w;
+    int h = dst.h;
+
+    // loop body
+    Mat rowsbuf0(w);
+    Mat rowsbuf1(w);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+
+    int prev_sy1 = -2;
+
+    for (int dy = 0; dy < h; dy++ )
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // reuse all rows
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // hresize one row
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows0_old;
+            const unsigned short* S1 = src.row<const unsigned short>(sy+1);
+
+            const float* alphap = alpha;
+            float* rows1p = rows1;
+            int dx = 0;
+            for ( ; dx < w; dx++ )
+            {
+                int sx = xofs[dx];
+                const unsigned short* S1p = S1 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                rows1p[dx] = bfloat16_to_float32(S1p[0])*a0 + bfloat16_to_float32(S1p[1])*a1;
+
+                alphap += 2;
+            }
+        }
+        else
+        {
+            // hresize two rows
+            const unsigned short* S0 = src.row<const unsigned short>(sy);
+            const unsigned short* S1 = src.row<const unsigned short>(sy+1);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            int dx = 0;
+            for ( ; dx < w; dx++ )
+            {
+                int sx = xofs[dx];
+                const unsigned short* S0p = S0 + sx;
+                const unsigned short* S1p = S1 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                rows0p[dx] = bfloat16_to_float32(S0p[0])*a0 + bfloat16_to_float32(S0p[1])*a1;
+                rows1p[dx] = bfloat16_to_float32(S1p[0])*a0 + bfloat16_to_float32(S1p[1])*a1;
+
+                alphap += 2;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vresize
+        float b0 = beta[0];
+        float b1 = beta[1];
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        unsigned short* Dp = dst.row<unsigned short>(dy);
+
+#if __ARM_NEON
+        int nn = w >> 3;
+#else
+        int nn = 0;
+#endif
+        int remain = w - (nn << 3);
+
+#if __ARM_NEON
+        float32x4_t _b0 = vdupq_n_f32(b0);
+        float32x4_t _b1 = vdupq_n_f32(b1);
+        for (; nn>0; nn--)
+        {
+            float32x4_t _rows0 = vld1q_f32(rows0p);
+            float32x4_t _rows1 = vld1q_f32(rows1p);
+
+            float32x4_t _D = vmulq_f32(_rows0, _b0);
+            _D = vmlaq_f32(_D, _rows1, _b1);
+
+            vst1_u16(Dp, vshrn_n_u32(vreinterpretq_u32_f32(_D), 16));
+
+            float32x4_t _rows0n = vld1q_f32(rows0p+4);
+            float32x4_t _rows1n = vld1q_f32(rows1p+4);
+
+            float32x4_t _Dn = vmulq_f32(_rows0n, _b0);
+            _Dn = vmlaq_f32(_Dn, _rows1n, _b1);
+
+            vst1_u16(Dp+4, vshrn_n_u32(vreinterpretq_u32_f32(_Dn), 16));
+
+            Dp += 8;
+            rows0p += 8;
+            rows1p += 8;
+        }
+#endif // __ARM_NEON
+        for ( ; remain; --remain )
+        {
+//             D[x] = rows0[x]*b0 + rows1[x]*b1;
+            *Dp++ = float32_to_bfloat16(*rows0p++ * b0 + *rows1p++ * b1);
+        }
+
+        beta += 2;
+    }
+}
diff --git a/src/layer/arm/interp_bilinear_pack4.h b/src/layer/arm/interp_bilinear_pack4.h
new file mode 100644
index 000000000..3a6698469
--- /dev/null
+++ b/src/layer/arm/interp_bilinear_pack4.h
@@ -0,0 +1,120 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
+{
+    int w = dst.w;
+    int h = dst.h;
+
+    // loop body
+    Mat rowsbuf0(w, (size_t)4 * 4u, 4);
+    Mat rowsbuf1(w, (size_t)4 * 4u, 4);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+
+    int prev_sy1 = -2;
+
+    for (int dy = 0; dy < h; dy++ )
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // reuse all rows
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // hresize one row
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows0_old;
+            const float* S1 = src.row(sy+1);
+
+            const float* alphap = alpha;
+            float* rows1p = rows1;
+            int dx = 0;
+            for ( ; dx < w; dx++ )
+            {
+                int sx = xofs[dx] * 4;
+                const float* S1p = S1 + sx;
+
+                float32x2_t _a01 = vld1_f32(alphap);
+
+                float32x4_t _S10 = vld1q_f32(S1p);
+                float32x4_t _S11 = vld1q_f32(S1p + 4);
+                float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0);
+                _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1);
+                vst1q_f32(rows1p + dx * 4, _rows1);
+
+                alphap += 2;
+            }
+        }
+        else
+        {
+            // hresize two rows
+            const float* S0 = src.row(sy);
+            const float* S1 = src.row(sy+1);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            int dx = 0;
+            for ( ; dx < w; dx++ )
+            {
+                int sx = xofs[dx] * 4;
+                const float* S0p = S0 + sx;
+                const float* S1p = S1 + sx;
+
+                float32x2_t _a01 = vld1_f32(alphap);
+
+                float32x4_t _S00 = vld1q_f32(S0p);
+                float32x4_t _S01 = vld1q_f32(S0p + 4);
+                float32x4_t _S10 = vld1q_f32(S1p);
+                float32x4_t _S11 = vld1q_f32(S1p + 4);
+                float32x4_t _rows0 = vmulq_lane_f32(_S00, _a01, 0);
+                float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0);
+                _rows0 = vmlaq_lane_f32(_rows0, _S01, _a01, 1);
+                _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1);
+                vst1q_f32(rows0p + dx * 4, _rows0);
+                vst1q_f32(rows1p + dx * 4, _rows1);
+
+                alphap += 2;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vresize
+        float32x2_t _b01 = vld1_f32(beta);
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        float* Dp = dst.row(dy);
+
+        for (int dx = 0; dx < w; dx++)
+        {
+            float32x4_t _rows0 = vld1q_f32(rows0p);
+            float32x4_t _rows1 = vld1q_f32(rows1p);
+            float32x4_t _D = vmulq_lane_f32(_rows0, _b01, 0);
+            _D = vmlaq_lane_f32(_D, _rows1, _b01, 1);
+            vst1q_f32(Dp, _D);
+
+            Dp += 4;
+            rows0p += 4;
+            rows1p += 4;
+        }
+
+        beta += 2;
+    }
+}
diff --git a/src/layer/arm/interp_bilinear_pack4_bf16s.h b/src/layer/arm/interp_bilinear_pack4_bf16s.h
new file mode 100644
index 000000000..4acbdcc7c
--- /dev/null
+++ b/src/layer/arm/interp_bilinear_pack4_bf16s.h
@@ -0,0 +1,120 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void resize_bilinear_image_pack4_bf16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
+{
+    int w = dst.w;
+    int h = dst.h;
+
+    // loop body
+    Mat rowsbuf0(w, (size_t)4 * 4u, 4);
+    Mat rowsbuf1(w, (size_t)4 * 4u, 4);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+
+    int prev_sy1 = -2;
+
+    for (int dy = 0; dy < h; dy++ )
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // reuse all rows
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // hresize one row
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows0_old;
+            const unsigned short* S1 = src.row<const unsigned short>(sy+1);
+
+            const float* alphap = alpha;
+            float* rows1p = rows1;
+            int dx = 0;
+            for ( ; dx < w; dx++ )
+            {
+                int sx = xofs[dx] * 4;
+                const unsigned short* S1p = S1 + sx;
+
+                float32x2_t _a01 = vld1_f32(alphap);
+
+                float32x4_t _S10 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p), 16));
+                float32x4_t _S11 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 4), 16));
+                float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0);
+                _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1);
+                vst1q_f32(rows1p + dx * 4, _rows1);
+
+                alphap += 2;
+            }
+        }
+        else
+        {
+            // hresize two rows
+            const unsigned short* S0 = src.row<const unsigned short>(sy);
+            const unsigned short* S1 = src.row<const unsigned short>(sy+1);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            int dx = 0;
+            for ( ; dx < w; dx++ )
+            {
+                int sx = xofs[dx] * 4;
+                const unsigned short* S0p = S0 + sx;
+                const unsigned short* S1p = S1 + sx;
+
+                float32x2_t _a01 = vld1_f32(alphap);
+
+                float32x4_t _S00 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S0p), 16));
+                float32x4_t _S01 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S0p + 4), 16));
+                float32x4_t _S10 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p), 16));
+                float32x4_t _S11 = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(S1p + 4), 16));
+                float32x4_t _rows0 = vmulq_lane_f32(_S00, _a01, 0);
+                float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0);
+                _rows0 = vmlaq_lane_f32(_rows0, _S01, _a01, 1);
+                _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1);
+                vst1q_f32(rows0p + dx * 4, _rows0);
+                vst1q_f32(rows1p + dx * 4, _rows1);
+
+                alphap += 2;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vresize
+        float32x2_t _b01 = vld1_f32(beta);
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        unsigned short* Dp = dst.row<unsigned short>(dy);
+
+        for (int dx = 0; dx < w; dx++)
+        {
+            float32x4_t _rows0 = vld1q_f32(rows0p);
+            float32x4_t _rows1 = vld1q_f32(rows1p);
+            float32x4_t _D = vmulq_lane_f32(_rows0, _b01, 0);
+            _D = vmlaq_lane_f32(_D, _rows1, _b01, 1);
+            vst1_u16(Dp, vshrn_n_u32(vreinterpretq_u32_f32(_D), 16));
+
+            Dp += 4;
+            rows0p += 4;
+            rows1p += 4;
+        }
+
+        beta += 2;
+    }
+}
diff --git a/src/layer/interp.cpp b/src/layer/interp.cpp
index 6af83ee70..9d6782ea8 100644
--- a/src/layer/interp.cpp
+++ b/src/layer/interp.cpp
@@ -33,6 +33,12 @@ int Interp::load_param(const ParamDict& pd)
     output_height = pd.get(3, 0);
     output_width = pd.get(4, 0);
 
+    if (resize_type < 1 || resize_type > 3)
+    {
+        fprintf(stderr, "unsupported resize type %d\n", resize_type);
+        return -1;
+    }
+
     return 0;
 }
 
@@ -391,37 +397,38 @@ static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xo
 
 int Interp::forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) const
 {
-    int h = bottom_blob.h;
     int w = bottom_blob.w;
-    int c = bottom_blob.c;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
     size_t elemsize = bottom_blob.elemsize;
 
-    int oh = output_height;
-    int ow = output_width;
+    int outh = output_height;
+    int outw = output_width;
     if (bottom_blob.dims == 1)
     {
         h = 1;
         w = 1;
-        c = bottom_blob.w;
+        channels = bottom_blob.w;
     }
-    if (oh == 0 || ow == 0)
+    if (outh == 0 || outw == 0)
     {
-        oh = static_cast<int>(h * height_scale);
-        ow = static_cast<int>(w * width_scale);
+        outh = static_cast<int>(h * height_scale);
+        outw = static_cast<int>(w * width_scale);
     }
-    if (oh == h && ow == w)
+    if (outh == h && outw == w)
     {
         top_blob = bottom_blob;
         return 0;
     }
-    top_blob.create(ow, oh, c, elemsize, opt.blob_allocator);
+
+    top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
     if (top_blob.empty())
         return -100;
 
     if (bottom_blob.dims == 1)
     {
         #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < c; ++q)
+        for (int q = 0; q < channels; q++)
         {
             Mat top_blob_c = top_blob.channel(q);
             const float *ptr = ((const float*)bottom_blob.data + q);
@@ -436,38 +443,37 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) co
         const float ws = output_width ? w / (float)output_width : 1.f / width_scale;
 
         #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < c; q++)
+        for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
             float* outptr = top_blob.channel(q);
-            for (int y = 0; y < oh; y++)
+            for (int y = 0; y < outh; y++)
             {
                 int in_y = std::min((int) (y * hs), (h - 1));
-                for (int x = 0; x < ow; x++)
+                for (int x = 0; x < outw; x++)
                 {
                     int in_x = std::min((int) (x * ws), (w - 1));
                     *outptr++ = ptr[in_y * w + in_x];
                 }
             }
         }
-
-        return 0;
     }
-    else if (resize_type == 2)// bilinear
+
+    if (resize_type == 2)// bilinear
     {
-        int* buf = new int[ow + oh + ow*2 + oh*2];
+        int* buf = new int[outw + outh + outw*2 + outh*2];
 
-        int* xofs = buf;//new int[ow];
-        int* yofs = buf + ow;//new int[oh];
+        int* xofs = buf;//new int[outw];
+        int* yofs = buf + outw;//new int[outh];
 
-        float* alpha = (float*)(buf + ow + oh);//new float[ow * 2];
-        float* beta = (float*)(buf + ow + oh + ow*2);//new float[oh * 2];
+        float* alpha = (float*)(buf + outw + outh);//new float[outw * 2];
+        float* beta = (float*)(buf + outw + outh + outw*2);//new float[outh * 2];
 
-        linear_coeffs(w, ow, xofs, alpha);
-        linear_coeffs(h, oh, yofs, beta);
+        linear_coeffs(w, outw, xofs, alpha);
+        linear_coeffs(h, outh, yofs, beta);
 
         #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < c; ++q)
+        for (int q = 0; q < channels; ++q)
         {
             const Mat src = bottom_blob.channel(q);
             Mat dst = top_blob.channel(q);
@@ -476,24 +482,23 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) co
         }
 
         delete[] buf;
-
-        return 0;
     }
-    else if (resize_type == 3)// bicubic
+
+    if (resize_type == 3)// bicubic
     {
-        int* buf = new int[ow + oh + ow*4 + oh*4];
+        int* buf = new int[outw + outh + outw*4 + outh*4];
 
-        int* xofs = buf;//new int[ow];
-        int* yofs = buf + ow;//new int[oh];
+        int* xofs = buf;//new int[outw];
+        int* yofs = buf + outw;//new int[outh];
 
-        float* alpha = (float*)(buf + ow + oh);//new float[ow * 4];
-        float* beta = (float*)(buf + ow + oh + ow*4);//new float[oh * 4];
+        float* alpha = (float*)(buf + outw + outh);//new float[outw * 4];
+        float* beta = (float*)(buf + outw + outh + outw*4);//new float[outh * 4];
 
-        cubic_coeffs(w, ow, xofs, alpha);
-        cubic_coeffs(h, oh, yofs, beta);
+        cubic_coeffs(w, outw, xofs, alpha);
+        cubic_coeffs(h, outh, yofs, beta);
 
         #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < c; ++q)
+        for (int q = 0; q < channels; q++)
         {
             const Mat src = bottom_blob.channel(q);
             Mat dst = top_blob.channel(q);
@@ -502,14 +507,9 @@ int Interp::forward(const Mat &bottom_blob, Mat &top_blob, const Option& opt) co
         }
 
         delete[] buf;
-
-        return 0;
-    }
-    else
-    {
-        fprintf(stderr, "unsupported resize type %d %d %d\n", resize_type, oh, ow);
-        return -233;
     }
+
+    return 0;
 }
 
 } // namespace ncnn