diff --git a/src/layer/x86/interp_bicubic.h b/src/layer/x86/interp_bicubic.h
new file mode 100644
index 000000000..344996cb6
--- /dev/null
+++ b/src/layer/x86/interp_bicubic.h
@@ -0,0 +1,310 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// Cubic convolution weights (A = -0.75) for the four taps around a sample position.
+// fx is the offset of the position from tap 1; coeffs[0..3] receive the four weights.
+static inline void interpolate_cubic(float fx, float* coeffs)
+{
+    const float A = -0.75f;
+    const float d0 = fx + 1; // distance to tap 0
+    const float d1 = fx;     // distance to tap 1
+    const float d2 = 1 - fx; // distance to tap 2
+
+    coeffs[0] = A * d0 * d0 * d0 - 5 * A * d0 * d0 + 8 * A * d0 - 4 * A;
+    coeffs[1] = (A + 2) * d1 * d1 * d1 - (A + 3) * d1 * d1 + 1;
+    coeffs[2] = (A + 2) * d2 * d2 * d2 - (A + 3) * d2 * d2 + 1;
+    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; // the four weights sum to 1
+}
+
+// Fill xofs[dx] and alpha[dx * 4 .. dx * 4 + 3] for every output column dx in [0, outw):
+// the four weights apply to source samples xofs[dx] - 1 .. xofs[dx] + 2. Offsets are kept
+// in [1, w - 3]; at a border, weights of out-of-row taps are folded onto the edge sample.
+// NOTE(review): a 4-tap window only fits when w >= 4 - confirm callers guarantee that.
+static void cubic_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner)
+{
+    // align_corner with outw == 1 has no span to divide by: use scale 0 instead of
+    // (w - 1) / 0, which made fx NaN and the cast of floor(fx) to int undefined.
+    double scale = (double)w / outw;
+    if (align_corner)
+    {
+        scale = outw > 1 ? (double)(w - 1) / (outw - 1) : 0.0;
+    }
+
+    for (int dx = 0; dx < outw; dx++)
+    {
+        float fx = align_corner ? (float)(dx * scale) : (float)((dx + 0.5) * scale - 0.5);
+        int sx = static_cast<int>(floor(fx));
+        fx -= sx;
+
+        float* a = alpha + dx * 4; // the four weights of this output column
+        interpolate_cubic(fx, a);
+
+        if (sx <= -1)
+        {
+            sx = 1;
+            a[0] = 1.f - a[3];
+            a[1] = a[3];
+            a[2] = a[3] = 0.f;
+        }
+        if (sx == 0)
+        {
+            sx = 1;
+            a[0] = a[0] + a[1];
+            a[1] = a[2];
+            a[2] = a[3];
+            a[3] = 0.f;
+        }
+        if (sx == w - 2)
+        {
+            sx = w - 3;
+            a[3] = a[2] + a[3];
+            a[2] = a[1];
+            a[1] = a[0];
+            a[0] = 0.f;
+        }
+        if (sx >= w - 1)
+        {
+            sx = w - 3;
+            a[3] = 1.f - a[0];
+            a[2] = a[0];
+            a[1] = a[0] = 0.f;
+        }
+
+        xofs[dx] = sx;
+    }
+}
+
+static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
+{
+    int w = dst.w; // output width: xofs holds one offset and alpha four weights per column
+    int h = dst.h; // output height: yofs holds one offset and beta four weights per row
+    // Separable bicubic resize of src into dst: source rows are first resampled horizontally
+    // into four row buffers, which are then blended vertically for every output row.
+    Mat rowsbuf0(w);
+    Mat rowsbuf1(w);
+    Mat rowsbuf2(w);
+    Mat rowsbuf3(w);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+    float* rows2 = rowsbuf2;
+    float* rows3 = rowsbuf3;
+    // after each pass rows0..rows3 hold the hresized source rows prev_sy1 - 1 .. prev_sy1 + 2
+    int prev_sy1 = -3; // sentinel; assumes yofs[0] > 0 so the first row takes the four-row path
+
+    for (int dy = 0; dy < h; dy++)
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // same source window as the previous output row: reuse all four buffers
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // window moved down by one row: rotate the buffers and hresize only row sy + 2
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows2;
+            rows2 = rows3;
+            rows3 = rows0_old; // the oldest buffer is recycled for the new row
+            const float* S3 = src.row(sy + 2);
+
+            const float* alphap = alpha;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const float* S3p = S3 + sx; // the four taps are S3p[-1] .. S3p[2]
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                float a2 = alphap[2];
+                float a3 = alphap[3];
+                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 2)
+        {
+            // window moved down by two rows: keep old rows2/rows3, hresize rows sy + 1 and sy + 2
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            rows0 = rows2;
+            rows1 = rows3;
+            rows2 = rows0_old;
+            rows3 = rows1_old;
+            const float* S2 = src.row(sy + 1);
+            const float* S3 = src.row(sy + 2);
+
+            const float* alphap = alpha;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                float a2 = alphap[2];
+                float a3 = alphap[3];
+                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
+                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 3)
+        {
+            // window moved down by three rows: keep old rows3, hresize rows sy, sy + 1 and sy + 2
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            float* rows2_old = rows2;
+            rows0 = rows3;
+            rows1 = rows0_old;
+            rows2 = rows1_old;
+            rows3 = rows2_old;
+            const float* S1 = src.row(sy);
+            const float* S2 = src.row(sy + 1);
+            const float* S3 = src.row(sy + 2);
+
+            const float* alphap = alpha;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const float* S1p = S1 + sx;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                float a2 = alphap[2];
+                float a3 = alphap[3];
+                rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3;
+                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
+                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;
+
+                alphap += 4;
+            }
+        }
+        else
+        {
+            // first row or any other jump: hresize all four rows sy - 1 .. sy + 2
+            const float* S0 = src.row(sy - 1);
+            const float* S1 = src.row(sy);
+            const float* S2 = src.row(sy + 1);
+            const float* S3 = src.row(sy + 2);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const float* S0p = S0 + sx;
+                const float* S1p = S1 + sx;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                float a2 = alphap[2];
+                float a3 = alphap[3];
+                rows0p[dx] = S0p[-1] * a0 + S0p[0] * a1 + S0p[1] * a2 + S0p[2] * a3;
+                rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3;
+                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
+                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;
+
+                alphap += 4;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vresize: dst row dy = rows0 * b0 + rows1 * b1 + rows2 * b2 + rows3 * b3
+        float b0 = beta[0];
+        float b1 = beta[1];
+        float b2 = beta[2];
+        float b3 = beta[3];
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        float* rows2p = rows2;
+        float* rows3p = rows3;
+        float* Dp = dst.row(dy);
+
+        int dx = 0; // shared by the AVX, SSE and scalar loops below; each continues where the last stopped
+#if __SSE2__
+#if __AVX__
+        __m256 _b0_256 = _mm256_set1_ps(b0);
+        __m256 _b1_256 = _mm256_set1_ps(b1);
+        __m256 _b2_256 = _mm256_set1_ps(b2);
+        __m256 _b3_256 = _mm256_set1_ps(b3);
+        for (; dx + 7 < w; dx += 8)
+        {
+            __m256 _rows0 = _mm256_loadu_ps(rows0p);
+            __m256 _rows1 = _mm256_loadu_ps(rows1p);
+            __m256 _rows2 = _mm256_loadu_ps(rows2p);
+            __m256 _rows3 = _mm256_loadu_ps(rows3p);
+            __m256 _D = _mm256_mul_ps(_rows0, _b0_256);
+            _D = _mm256_comp_fmadd_ps(_rows1, _b1_256, _D);
+            _D = _mm256_comp_fmadd_ps(_rows2, _b2_256, _D);
+            _D = _mm256_comp_fmadd_ps(_rows3, _b3_256, _D);
+            _mm256_storeu_ps(Dp, _D);
+
+            Dp += 8;
+            rows0p += 8;
+            rows1p += 8;
+            rows2p += 8;
+            rows3p += 8;
+        }
+#endif // __AVX__
+        __m128 _b0_128 = _mm_set1_ps(b0);
+        __m128 _b1_128 = _mm_set1_ps(b1);
+        __m128 _b2_128 = _mm_set1_ps(b2);
+        __m128 _b3_128 = _mm_set1_ps(b3);
+        for (; dx + 3 < w; dx += 4)
+        {
+            __m128 _rows0 = _mm_loadu_ps(rows0p);
+            __m128 _rows1 = _mm_loadu_ps(rows1p);
+            __m128 _rows2 = _mm_loadu_ps(rows2p);
+            __m128 _rows3 = _mm_loadu_ps(rows3p);
+            __m128 _D = _mm_mul_ps(_rows0, _b0_128);
+            _D = _mm_comp_fmadd_ps(_rows1, _b1_128, _D);
+            _D = _mm_comp_fmadd_ps(_rows2, _b2_128, _D);
+            _D = _mm_comp_fmadd_ps(_rows3, _b3_128, _D);
+            _mm_storeu_ps(Dp, _D);
+
+            Dp += 4;
+            rows0p += 4;
+            rows1p += 4;
+            rows2p += 4;
+            rows3p += 4;
+        }
+#endif // __SSE2__
+        for (; dx < w; dx++) // scalar tail (or the whole row without SSE2)
+        {
+            *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3;
+        }
+
+        beta += 4; // advance to the next output row's four weights
+    }
+}
diff --git a/src/layer/x86/interp_bicubic_pack4.h b/src/layer/x86/interp_bicubic_pack4.h
new file mode 100644
index 000000000..5718bdc87
--- /dev/null
+++ b/src/layer/x86/interp_bicubic_pack4.h
@@ -0,0 +1,286 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
+{
+    int w = dst.w; // output width in elements: xofs holds one offset and alpha four weights per column
+    int h = dst.h; // output height: yofs holds one offset and beta four weights per row
+    // Bicubic resize for data stored as 4 interleaved floats per element: source rows are
+    // hresized into four row buffers (w elements each) and then blended vertically.
+    Mat rowsbuf0(w, (size_t)4 * 4u, 4);
+    Mat rowsbuf1(w, (size_t)4 * 4u, 4);
+    Mat rowsbuf2(w, (size_t)4 * 4u, 4);
+    Mat rowsbuf3(w, (size_t)4 * 4u, 4);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+    float* rows2 = rowsbuf2;
+    float* rows3 = rowsbuf3;
+    // _mm_load_ps/_mm_store_ps need 16-byte aligned addresses for src, dst and the row buffers
+    int prev_sy1 = -3; // sentinel; assumes yofs[0] > 0 so the first row takes the four-row path
+
+    for (int dy = 0; dy < h; dy++)
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // same source window as the previous output row: reuse all four buffers
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // window moved down by one row: rotate the buffers and hresize only row sy + 2
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows2;
+            rows2 = rows3;
+            rows3 = rows0_old; // the oldest buffer is recycled for the new row
+            const float* S3 = src.row(sy + 2);
+
+            const float* alphap = alpha;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4; // element offset -> float offset (4 floats per element)
+                const float* S3p = S3 + sx;
+
+                __m128 _a0 = _mm_set1_ps(alphap[0]);
+                __m128 _a1 = _mm_set1_ps(alphap[1]);
+                __m128 _a2 = _mm_set1_ps(alphap[2]);
+                __m128 _a3 = _mm_set1_ps(alphap[3]);
+
+                __m128 _S30 = _mm_load_ps(S3p - 4);
+                __m128 _S31 = _mm_load_ps(S3p + 0);
+                __m128 _S32 = _mm_load_ps(S3p + 4);
+                __m128 _S33 = _mm_load_ps(S3p + 8);
+                __m128 _rows3 = _mm_mul_ps(_S30, _a0);
+                _rows3 = _mm_comp_fmadd_ps(_S31, _a1, _rows3);
+                _rows3 = _mm_comp_fmadd_ps(_S32, _a2, _rows3);
+                _rows3 = _mm_comp_fmadd_ps(_S33, _a3, _rows3);
+                _mm_store_ps(rows3p + dx * 4, _rows3);
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 2)
+        {
+            // window moved down by two rows: keep old rows2/rows3, hresize rows sy + 1 and sy + 2
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            rows0 = rows2;
+            rows1 = rows3;
+            rows2 = rows0_old;
+            rows3 = rows1_old;
+            const float* S2 = src.row(sy + 1);
+            const float* S3 = src.row(sy + 2);
+
+            const float* alphap = alpha;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                __m128 _a0 = _mm_set1_ps(alphap[0]);
+                __m128 _a1 = _mm_set1_ps(alphap[1]);
+                __m128 _a2 = _mm_set1_ps(alphap[2]);
+                __m128 _a3 = _mm_set1_ps(alphap[3]);
+
+                __m128 _S20 = _mm_load_ps(S2p - 4);
+                __m128 _S21 = _mm_load_ps(S2p + 0);
+                __m128 _S22 = _mm_load_ps(S2p + 4);
+                __m128 _S23 = _mm_load_ps(S2p + 8);
+                __m128 _S30 = _mm_load_ps(S3p - 4);
+                __m128 _S31 = _mm_load_ps(S3p + 0);
+                __m128 _S32 = _mm_load_ps(S3p + 4);
+                __m128 _S33 = _mm_load_ps(S3p + 8);
+                __m128 _rows2 = _mm_mul_ps(_S20, _a0);
+                __m128 _rows3 = _mm_mul_ps(_S30, _a0);
+                _rows2 = _mm_comp_fmadd_ps(_S21, _a1, _rows2);
+                _rows3 = _mm_comp_fmadd_ps(_S31, _a1, _rows3);
+                _rows2 = _mm_comp_fmadd_ps(_S22, _a2, _rows2);
+                _rows3 = _mm_comp_fmadd_ps(_S32, _a2, _rows3);
+                _rows2 = _mm_comp_fmadd_ps(_S23, _a3, _rows2);
+                _rows3 = _mm_comp_fmadd_ps(_S33, _a3, _rows3);
+                _mm_store_ps(rows2p + dx * 4, _rows2);
+                _mm_store_ps(rows3p + dx * 4, _rows3);
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 3)
+        {
+            // window moved down by three rows: keep old rows3, hresize rows sy, sy + 1 and sy + 2
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            float* rows2_old = rows2;
+            rows0 = rows3;
+            rows1 = rows0_old;
+            rows2 = rows1_old;
+            rows3 = rows2_old;
+            const float* S1 = src.row(sy);
+            const float* S2 = src.row(sy + 1);
+            const float* S3 = src.row(sy + 2);
+
+            const float* alphap = alpha;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4;
+                const float* S1p = S1 + sx;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                __m128 _a0 = _mm_set1_ps(alphap[0]);
+                __m128 _a1 = _mm_set1_ps(alphap[1]);
+                __m128 _a2 = _mm_set1_ps(alphap[2]);
+                __m128 _a3 = _mm_set1_ps(alphap[3]);
+
+                __m128 _S10 = _mm_load_ps(S1p - 4);
+                __m128 _S11 = _mm_load_ps(S1p + 0);
+                __m128 _S12 = _mm_load_ps(S1p + 4);
+                __m128 _S13 = _mm_load_ps(S1p + 8);
+                __m128 _S20 = _mm_load_ps(S2p - 4);
+                __m128 _S21 = _mm_load_ps(S2p + 0);
+                __m128 _S22 = _mm_load_ps(S2p + 4);
+                __m128 _S23 = _mm_load_ps(S2p + 8);
+                __m128 _S30 = _mm_load_ps(S3p - 4);
+                __m128 _S31 = _mm_load_ps(S3p + 0);
+                __m128 _S32 = _mm_load_ps(S3p + 4);
+                __m128 _S33 = _mm_load_ps(S3p + 8);
+                __m128 _rows1 = _mm_mul_ps(_S10, _a0);
+                __m128 _rows2 = _mm_mul_ps(_S20, _a0);
+                __m128 _rows3 = _mm_mul_ps(_S30, _a0);
+                _rows1 = _mm_comp_fmadd_ps(_S11, _a1, _rows1);
+                _rows2 = _mm_comp_fmadd_ps(_S21, _a1, _rows2);
+                _rows3 = _mm_comp_fmadd_ps(_S31, _a1, _rows3);
+                _rows1 = _mm_comp_fmadd_ps(_S12, _a2, _rows1);
+                _rows2 = _mm_comp_fmadd_ps(_S22, _a2, _rows2);
+                _rows3 = _mm_comp_fmadd_ps(_S32, _a2, _rows3);
+                _rows1 = _mm_comp_fmadd_ps(_S13, _a3, _rows1);
+                _rows2 = _mm_comp_fmadd_ps(_S23, _a3, _rows2);
+                _rows3 = _mm_comp_fmadd_ps(_S33, _a3, _rows3);
+                _mm_store_ps(rows1p + dx * 4, _rows1);
+                _mm_store_ps(rows2p + dx * 4, _rows2);
+                _mm_store_ps(rows3p + dx * 4, _rows3);
+
+                alphap += 4;
+            }
+        }
+        else
+        {
+            // first row or any other jump: hresize all four rows sy - 1 .. sy + 2
+            const float* S0 = src.row(sy - 1);
+            const float* S1 = src.row(sy);
+            const float* S2 = src.row(sy + 1);
+            const float* S3 = src.row(sy + 2);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4;
+                const float* S0p = S0 + sx;
+                const float* S1p = S1 + sx;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                __m128 _a0 = _mm_set1_ps(alphap[0]);
+                __m128 _a1 = _mm_set1_ps(alphap[1]);
+                __m128 _a2 = _mm_set1_ps(alphap[2]);
+                __m128 _a3 = _mm_set1_ps(alphap[3]);
+
+                __m128 _S00 = _mm_load_ps(S0p - 4);
+                __m128 _S01 = _mm_load_ps(S0p + 0);
+                __m128 _S02 = _mm_load_ps(S0p + 4);
+                __m128 _S03 = _mm_load_ps(S0p + 8);
+                __m128 _S10 = _mm_load_ps(S1p - 4);
+                __m128 _S11 = _mm_load_ps(S1p + 0);
+                __m128 _S12 = _mm_load_ps(S1p + 4);
+                __m128 _S13 = _mm_load_ps(S1p + 8);
+                __m128 _S20 = _mm_load_ps(S2p - 4);
+                __m128 _S21 = _mm_load_ps(S2p + 0);
+                __m128 _S22 = _mm_load_ps(S2p + 4);
+                __m128 _S23 = _mm_load_ps(S2p + 8);
+                __m128 _S30 = _mm_load_ps(S3p - 4);
+                __m128 _S31 = _mm_load_ps(S3p + 0);
+                __m128 _S32 = _mm_load_ps(S3p + 4);
+                __m128 _S33 = _mm_load_ps(S3p + 8);
+                __m128 _rows0 = _mm_mul_ps(_S00, _a0);
+                __m128 _rows1 = _mm_mul_ps(_S10, _a0);
+                __m128 _rows2 = _mm_mul_ps(_S20, _a0);
+                __m128 _rows3 = _mm_mul_ps(_S30, _a0);
+                _rows0 = _mm_comp_fmadd_ps(_S01, _a1, _rows0);
+                _rows1 = _mm_comp_fmadd_ps(_S11, _a1, _rows1);
+                _rows2 = _mm_comp_fmadd_ps(_S21, _a1, _rows2);
+                _rows3 = _mm_comp_fmadd_ps(_S31, _a1, _rows3);
+                _rows0 = _mm_comp_fmadd_ps(_S02, _a2, _rows0);
+                _rows1 = _mm_comp_fmadd_ps(_S12, _a2, _rows1);
+                _rows2 = _mm_comp_fmadd_ps(_S22, _a2, _rows2);
+                _rows3 = _mm_comp_fmadd_ps(_S32, _a2, _rows3);
+                _rows0 = _mm_comp_fmadd_ps(_S03, _a3, _rows0);
+                _rows1 = _mm_comp_fmadd_ps(_S13, _a3, _rows1);
+                _rows2 = _mm_comp_fmadd_ps(_S23, _a3, _rows2);
+                _rows3 = _mm_comp_fmadd_ps(_S33, _a3, _rows3);
+                _mm_store_ps(rows0p + dx * 4, _rows0);
+                _mm_store_ps(rows1p + dx * 4, _rows1);
+                _mm_store_ps(rows2p + dx * 4, _rows2);
+                _mm_store_ps(rows3p + dx * 4, _rows3);
+
+                alphap += 4;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vresize: dst row dy = rows0 * b0 + rows1 * b1 + rows2 * b2 + rows3 * b3
+        __m128 _b0 = _mm_set1_ps(beta[0]);
+        __m128 _b1 = _mm_set1_ps(beta[1]);
+        __m128 _b2 = _mm_set1_ps(beta[2]);
+        __m128 _b3 = _mm_set1_ps(beta[3]);
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        float* rows2p = rows2;
+        float* rows3p = rows3;
+        float* Dp = dst.row(dy);
+
+        for (int dx = 0; dx < w; dx++) // one 4-float element per iteration
+        {
+            __m128 _rows0 = _mm_load_ps(rows0p);
+            __m128 _rows1 = _mm_load_ps(rows1p);
+            __m128 _rows2 = _mm_load_ps(rows2p);
+            __m128 _rows3 = _mm_load_ps(rows3p);
+            __m128 _D = _mm_mul_ps(_rows0, _b0);
+            _D = _mm_comp_fmadd_ps(_rows1, _b1, _D);
+            _D = _mm_comp_fmadd_ps(_rows2, _b2, _D);
+            _D = _mm_comp_fmadd_ps(_rows3, _b3, _D);
+            _mm_store_ps(Dp, _D);
+
+            Dp += 4;
+            rows0p += 4;
+            rows1p += 4;
+            rows2p += 4;
+            rows3p += 4;
+        }
+
+        beta += 4; // advance to the next output row's four weights
+    }
+}
diff --git a/src/layer/x86/interp_bicubic_pack8.h b/src/layer/x86/interp_bicubic_pack8.h
new file mode 100644
index 000000000..c70bc7b15
--- /dev/null
+++ b/src/layer/x86/interp_bicubic_pack8.h
@@ -0,0 +1,286 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void resize_bicubic_image_pack8(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
+{
+    int w = dst.w; // output width in elements: xofs holds one offset and alpha four weights per column
+    int h = dst.h; // output height: yofs holds one offset and beta four weights per row
+    // Bicubic resize for data stored as 8 interleaved floats per element: source rows are
+    // hresized into four row buffers (w elements each) and then blended vertically.
+    Mat rowsbuf0(w, (size_t)8 * 4u, 8);
+    Mat rowsbuf1(w, (size_t)8 * 4u, 8);
+    Mat rowsbuf2(w, (size_t)8 * 4u, 8);
+    Mat rowsbuf3(w, (size_t)8 * 4u, 8);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+    float* rows2 = rowsbuf2;
+    float* rows3 = rowsbuf3;
+    // _mm256_load_ps/_mm256_store_ps need 32-byte aligned addresses for src, dst and the row buffers
+    int prev_sy1 = -3; // sentinel; assumes yofs[0] > 0 so the first row takes the four-row path
+
+    for (int dy = 0; dy < h; dy++)
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // same source window as the previous output row: reuse all four buffers
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // window moved down by one row: rotate the buffers and hresize only row sy + 2
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows2;
+            rows2 = rows3;
+            rows3 = rows0_old; // the oldest buffer is recycled for the new row
+            const float* S3 = src.row(sy + 2);
+
+            const float* alphap = alpha;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 8; // element offset -> float offset (8 floats per element)
+                const float* S3p = S3 + sx;
+
+                __m256 _a0 = _mm256_set1_ps(alphap[0]);
+                __m256 _a1 = _mm256_set1_ps(alphap[1]);
+                __m256 _a2 = _mm256_set1_ps(alphap[2]);
+                __m256 _a3 = _mm256_set1_ps(alphap[3]);
+
+                __m256 _S30 = _mm256_load_ps(S3p - 8);
+                __m256 _S31 = _mm256_load_ps(S3p + 0);
+                __m256 _S32 = _mm256_load_ps(S3p + 8);
+                __m256 _S33 = _mm256_load_ps(S3p + 16);
+                __m256 _rows3 = _mm256_mul_ps(_S30, _a0);
+                _rows3 = _mm256_comp_fmadd_ps(_S31, _a1, _rows3);
+                _rows3 = _mm256_comp_fmadd_ps(_S32, _a2, _rows3);
+                _rows3 = _mm256_comp_fmadd_ps(_S33, _a3, _rows3);
+                _mm256_store_ps(rows3p + dx * 8, _rows3);
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 2)
+        {
+            // window moved down by two rows: keep old rows2/rows3, hresize rows sy + 1 and sy + 2
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            rows0 = rows2;
+            rows1 = rows3;
+            rows2 = rows0_old;
+            rows3 = rows1_old;
+            const float* S2 = src.row(sy + 1);
+            const float* S3 = src.row(sy + 2);
+
+            const float* alphap = alpha;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 8;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                __m256 _a0 = _mm256_set1_ps(alphap[0]);
+                __m256 _a1 = _mm256_set1_ps(alphap[1]);
+                __m256 _a2 = _mm256_set1_ps(alphap[2]);
+                __m256 _a3 = _mm256_set1_ps(alphap[3]);
+
+                __m256 _S20 = _mm256_load_ps(S2p - 8);
+                __m256 _S21 = _mm256_load_ps(S2p + 0);
+                __m256 _S22 = _mm256_load_ps(S2p + 8);
+                __m256 _S23 = _mm256_load_ps(S2p + 16);
+                __m256 _S30 = _mm256_load_ps(S3p - 8);
+                __m256 _S31 = _mm256_load_ps(S3p + 0);
+                __m256 _S32 = _mm256_load_ps(S3p + 8);
+                __m256 _S33 = _mm256_load_ps(S3p + 16);
+                __m256 _rows2 = _mm256_mul_ps(_S20, _a0);
+                __m256 _rows3 = _mm256_mul_ps(_S30, _a0);
+                _rows2 = _mm256_comp_fmadd_ps(_S21, _a1, _rows2);
+                _rows3 = _mm256_comp_fmadd_ps(_S31, _a1, _rows3);
+                _rows2 = _mm256_comp_fmadd_ps(_S22, _a2, _rows2);
+                _rows3 = _mm256_comp_fmadd_ps(_S32, _a2, _rows3);
+                _rows2 = _mm256_comp_fmadd_ps(_S23, _a3, _rows2);
+                _rows3 = _mm256_comp_fmadd_ps(_S33, _a3, _rows3);
+                _mm256_store_ps(rows2p + dx * 8, _rows2);
+                _mm256_store_ps(rows3p + dx * 8, _rows3);
+
+                alphap += 4;
+            }
+        }
+        else if (sy == prev_sy1 + 3)
+        {
+            // window moved down by three rows: keep old rows3, hresize rows sy, sy + 1 and sy + 2
+            float* rows0_old = rows0;
+            float* rows1_old = rows1;
+            float* rows2_old = rows2;
+            rows0 = rows3;
+            rows1 = rows0_old;
+            rows2 = rows1_old;
+            rows3 = rows2_old;
+            const float* S1 = src.row(sy);
+            const float* S2 = src.row(sy + 1);
+            const float* S3 = src.row(sy + 2);
+
+            const float* alphap = alpha;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 8;
+                const float* S1p = S1 + sx;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                __m256 _a0 = _mm256_set1_ps(alphap[0]);
+                __m256 _a1 = _mm256_set1_ps(alphap[1]);
+                __m256 _a2 = _mm256_set1_ps(alphap[2]);
+                __m256 _a3 = _mm256_set1_ps(alphap[3]);
+
+                __m256 _S10 = _mm256_load_ps(S1p - 8);
+                __m256 _S11 = _mm256_load_ps(S1p + 0);
+                __m256 _S12 = _mm256_load_ps(S1p + 8);
+                __m256 _S13 = _mm256_load_ps(S1p + 16);
+                __m256 _S20 = _mm256_load_ps(S2p - 8);
+                __m256 _S21 = _mm256_load_ps(S2p + 0);
+                __m256 _S22 = _mm256_load_ps(S2p + 8);
+                __m256 _S23 = _mm256_load_ps(S2p + 16);
+                __m256 _S30 = _mm256_load_ps(S3p - 8);
+                __m256 _S31 = _mm256_load_ps(S3p + 0);
+                __m256 _S32 = _mm256_load_ps(S3p + 8);
+                __m256 _S33 = _mm256_load_ps(S3p + 16);
+                __m256 _rows1 = _mm256_mul_ps(_S10, _a0);
+                __m256 _rows2 = _mm256_mul_ps(_S20, _a0);
+                __m256 _rows3 = _mm256_mul_ps(_S30, _a0);
+                _rows1 = _mm256_comp_fmadd_ps(_S11, _a1, _rows1);
+                _rows2 = _mm256_comp_fmadd_ps(_S21, _a1, _rows2);
+                _rows3 = _mm256_comp_fmadd_ps(_S31, _a1, _rows3);
+                _rows1 = _mm256_comp_fmadd_ps(_S12, _a2, _rows1);
+                _rows2 = _mm256_comp_fmadd_ps(_S22, _a2, _rows2);
+                _rows3 = _mm256_comp_fmadd_ps(_S32, _a2, _rows3);
+                _rows1 = _mm256_comp_fmadd_ps(_S13, _a3, _rows1);
+                _rows2 = _mm256_comp_fmadd_ps(_S23, _a3, _rows2);
+                _rows3 = _mm256_comp_fmadd_ps(_S33, _a3, _rows3);
+                _mm256_store_ps(rows1p + dx * 8, _rows1);
+                _mm256_store_ps(rows2p + dx * 8, _rows2);
+                _mm256_store_ps(rows3p + dx * 8, _rows3);
+
+                alphap += 4;
+            }
+        }
+        else
+        {
+            // first row or any other jump: hresize all four rows sy - 1 .. sy + 2
+            const float* S0 = src.row(sy - 1);
+            const float* S1 = src.row(sy);
+            const float* S2 = src.row(sy + 1);
+            const float* S3 = src.row(sy + 2);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            float* rows2p = rows2;
+            float* rows3p = rows3;
+            for (int dx = 0; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 8;
+                const float* S0p = S0 + sx;
+                const float* S1p = S1 + sx;
+                const float* S2p = S2 + sx;
+                const float* S3p = S3 + sx;
+
+                __m256 _a0 = _mm256_set1_ps(alphap[0]);
+                __m256 _a1 = _mm256_set1_ps(alphap[1]);
+                __m256 _a2 = _mm256_set1_ps(alphap[2]);
+                __m256 _a3 = _mm256_set1_ps(alphap[3]);
+
+                __m256 _S00 = _mm256_load_ps(S0p - 8);
+                __m256 _S01 = _mm256_load_ps(S0p + 0);
+                __m256 _S02 = _mm256_load_ps(S0p + 8);
+                __m256 _S03 = _mm256_load_ps(S0p + 16);
+                __m256 _S10 = _mm256_load_ps(S1p - 8);
+                __m256 _S11 = _mm256_load_ps(S1p + 0);
+                __m256 _S12 = _mm256_load_ps(S1p + 8);
+                __m256 _S13 = _mm256_load_ps(S1p + 16);
+                __m256 _S20 = _mm256_load_ps(S2p - 8);
+                __m256 _S21 = _mm256_load_ps(S2p + 0);
+                __m256 _S22 = _mm256_load_ps(S2p + 8);
+                __m256 _S23 = _mm256_load_ps(S2p + 16);
+                __m256 _S30 = _mm256_load_ps(S3p - 8);
+                __m256 _S31 = _mm256_load_ps(S3p + 0);
+                __m256 _S32 = _mm256_load_ps(S3p + 8);
+                __m256 _S33 = _mm256_load_ps(S3p + 16);
+                __m256 _rows0 = _mm256_mul_ps(_S00, _a0);
+                __m256 _rows1 = _mm256_mul_ps(_S10, _a0);
+                __m256 _rows2 = _mm256_mul_ps(_S20, _a0);
+                __m256 _rows3 = _mm256_mul_ps(_S30, _a0);
+                _rows0 = _mm256_comp_fmadd_ps(_S01, _a1, _rows0);
+                _rows1 = _mm256_comp_fmadd_ps(_S11, _a1, _rows1);
+                _rows2 = _mm256_comp_fmadd_ps(_S21, _a1, _rows2);
+                _rows3 = _mm256_comp_fmadd_ps(_S31, _a1, _rows3);
+                _rows0 = _mm256_comp_fmadd_ps(_S02, _a2, _rows0);
+                _rows1 = _mm256_comp_fmadd_ps(_S12, _a2, _rows1);
+                _rows2 = _mm256_comp_fmadd_ps(_S22, _a2, _rows2);
+                _rows3 = _mm256_comp_fmadd_ps(_S32, _a2, _rows3);
+                _rows0 = _mm256_comp_fmadd_ps(_S03, _a3, _rows0);
+                _rows1 = _mm256_comp_fmadd_ps(_S13, _a3, _rows1);
+                _rows2 = _mm256_comp_fmadd_ps(_S23, _a3, _rows2);
+                _rows3 = _mm256_comp_fmadd_ps(_S33, _a3, _rows3);
+                _mm256_store_ps(rows0p + dx * 8, _rows0);
+                _mm256_store_ps(rows1p + dx * 8, _rows1);
+                _mm256_store_ps(rows2p + dx * 8, _rows2);
+                _mm256_store_ps(rows3p + dx * 8, _rows3);
+
+                alphap += 4;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vresize: dst row dy = rows0 * b0 + rows1 * b1 + rows2 * b2 + rows3 * b3
+        __m256 _b0 = _mm256_set1_ps(beta[0]);
+        __m256 _b1 = _mm256_set1_ps(beta[1]);
+        __m256 _b2 = _mm256_set1_ps(beta[2]);
+        __m256 _b3 = _mm256_set1_ps(beta[3]);
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        float* rows2p = rows2;
+        float* rows3p = rows3;
+        float* Dp = dst.row(dy);
+
+        for (int dx = 0; dx < w; dx++) // one 8-float element per iteration
+        {
+            __m256 _rows0 = _mm256_load_ps(rows0p);
+            __m256 _rows1 = _mm256_load_ps(rows1p);
+            __m256 _rows2 = _mm256_load_ps(rows2p);
+            __m256 _rows3 = _mm256_load_ps(rows3p);
+            __m256 _D = _mm256_mul_ps(_rows0, _b0);
+            _D = _mm256_comp_fmadd_ps(_rows1, _b1, _D);
+            _D = _mm256_comp_fmadd_ps(_rows2, _b2, _D);
+            _D = _mm256_comp_fmadd_ps(_rows3, _b3, _D);
+            _mm256_store_ps(Dp, _D);
+
+            Dp += 8;
+            rows0p += 8;
+            rows1p += 8;
+            rows2p += 8;
+            rows3p += 8;
+        }
+
+        beta += 4; // advance to the next output row's four weights
+    }
+}
diff --git a/src/layer/x86/interp_bilinear.h b/src/layer/x86/interp_bilinear.h
new file mode 100644
index 000000000..d7eccb7be
--- /dev/null
+++ b/src/layer/x86/interp_bilinear.h
@@ -0,0 +1,171 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void linear_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner) // builds the two-tap linear interpolation table for one axis (source length w, output length outw)
+{ // output: xofs[dx] = index of the first tap, alpha[dx * 2] / alpha[dx * 2 + 1] = weights of the first / second tap, for dx in [0, outw)
+    double scale = (double)w / outw; // source step per output step for the half-pixel-centre mapping
+    if (align_corner)
+    { // corner-aligned mapping: output index dx maps to dx * (w - 1) / (outw - 1)
+        scale = outw > 1 ? (double)(w - 1) / (outw - 1) : 0.0; // outw == 1 used to divide by zero (NaN fx, undefined float-to-int conversion); a step of 0 maps the single output onto source index 0
+    }
+
+    for (int dx = 0; dx < outw; dx++)
+    {
+        float fx = (float)((dx + 0.5) * scale - 0.5); // source coordinate of the centre of output sample dx
+        if (align_corner)
+        {
+            fx = (float)(dx * scale);
+        }
+
+        int sx = static_cast<int>(floor(fx)); // integer part selects the first tap
+        fx -= sx; // fractional part becomes the weight of the second tap
+
+        if (sx < 0)
+        { // coordinate lies before the first sample: all weight on index 0
+            sx = 0;
+            fx = 0.f;
+        }
+        if (sx >= w - 1)
+        { // first tap would be the last sample or beyond: use the pair (w - 2, w - 1) with all weight on w - 1; note this yields sx == -1 when w == 1
+            sx = w - 2;
+            fx = 1.f;
+        }
+
+        xofs[dx] = sx;
+
+        alpha[dx * 2] = 1.f - fx;
+        alpha[dx * 2 + 1] = fx;
+    }
+}
+
+static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) // two-pass bilinear resize of src into dst driven by precomputed tap tables
+{ // alpha/xofs: two weights and first source column per output column; beta/yofs: two weights and first source row per output row; output size is taken from dst
+    int w = dst.w;
+    int h = dst.h;
+
+    // two scratch rows, indexed as float[0..w): source rows sy and sy + 1 after horizontal interpolation
+    Mat rowsbuf0(w);
+    Mat rowsbuf1(w);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+
+    int prev_sy1 = -2; // sy of the previous output row; -2 makes a first sy >= 0 fail both reuse tests below so both rows get filled. NOTE(review): a first sy of -1 would take the one-row branch with rows0 never written - confirm yofs cannot start at -1
+
+    for (int dy = 0; dy < h; dy++)
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // same source row pair as the previous output row: both scratch rows are still valid
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // window moved down by one row: swap so the old rows1 becomes rows0, then interpolate only source row sy + 1
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows0_old;
+            const float* S1 = src.row(sy + 1);
+
+            const float* alphap = alpha; // restart at the first column's weights
+            float* rows1p = rows1;
+            int dx = 0;
+            for (; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const float* S1p = S1 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; // blend columns sx and sx + 1
+
+                alphap += 2;
+            }
+        }
+        else
+        {
+            // any other change of sy: interpolate both source rows sy and sy + 1 horizontally
+            const float* S0 = src.row(sy);
+            const float* S1 = src.row(sy + 1);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            int dx = 0;
+            for (; dx < w; dx++)
+            {
+                int sx = xofs[dx];
+                const float* S0p = S0 + sx;
+                const float* S1p = S1 + sx;
+
+                float a0 = alphap[0];
+                float a1 = alphap[1];
+                rows0p[dx] = S0p[0] * a0 + S0p[1] * a1; // same column weights applied to both rows
+                rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;
+
+                alphap += 2;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vertical pass: dst row dy = rows0 * b0 + rows1 * b1
+        float b0 = beta[0];
+        float b1 = beta[1];
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        float* Dp = dst.row(dy);
+
+        int dx = 0; // shared by the 8-wide, 4-wide and scalar loops so each continues where the previous stopped
+#if __SSE2__
+#if __AVX__
+        __m256 _b0_256 = _mm256_set1_ps(b0);
+        __m256 _b1_256 = _mm256_set1_ps(b1);
+        for (; dx + 7 < w; dx += 8)
+        {
+            __m256 _rows0 = _mm256_loadu_ps(rows0p); // unaligned load/store variants are used throughout this pass
+            __m256 _rows1 = _mm256_loadu_ps(rows1p);
+            __m256 _D = _mm256_mul_ps(_rows0, _b0_256);
+            _D = _mm256_comp_fmadd_ps(_rows1, _b1_256, _D); // presumably _rows1 * _b1_256 + _D, mirroring the scalar tail below; helper is defined outside this file
+            _mm256_storeu_ps(Dp, _D);
+
+            Dp += 8;
+            rows0p += 8;
+            rows1p += 8;
+        }
+#endif // __AVX__
+        __m128 _b0_128 = _mm_set1_ps(b0);
+        __m128 _b1_128 = _mm_set1_ps(b1);
+        for (; dx + 3 < w; dx += 4)
+        {
+            __m128 _rows0 = _mm_loadu_ps(rows0p);
+            __m128 _rows1 = _mm_loadu_ps(rows1p);
+            __m128 _D = _mm_mul_ps(_rows0, _b0_128);
+            _D = _mm_comp_fmadd_ps(_rows1, _b1_128, _D); // presumably the 4-wide counterpart of the helper above
+            _mm_storeu_ps(Dp, _D);
+
+            Dp += 4;
+            rows0p += 4;
+            rows1p += 4;
+        }
+#endif // __SSE2__
+        for (; dx < w; dx++)
+        { // scalar tail for the remaining columns (all columns when SSE2 is not available)
+            *Dp++ = *rows0p++ * b0 + *rows1p++ * b1;
+        }
+
+        beta += 2; // advance to the next output row's pair of weights
+    }
+}
diff --git a/src/layer/x86/interp_bilinear_pack4.h b/src/layer/x86/interp_bilinear_pack4.h
new file mode 100644
index 000000000..4f50caf74
--- /dev/null
+++ b/src/layer/x86/interp_bilinear_pack4.h
@@ -0,0 +1,123 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) // bilinear resize where every pixel is a group of 4 consecutive floats handled as one __m128
+{ // alpha/xofs: two weights and first source column per output column; beta/yofs: two weights and first source row per output row; output size is taken from dst
+    int w = dst.w;
+    int h = dst.h;
+
+    // two scratch rows holding w elements of 4 floats each; presumably the Mat arguments are (w, elemsize = 16 bytes, elempack = 4) - confirm against Mat
+    Mat rowsbuf0(w, (size_t)4 * 4u, 4);
+    Mat rowsbuf1(w, (size_t)4 * 4u, 4);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+
+    int prev_sy1 = -2; // sy of the previous output row; -2 makes a first sy >= 0 fail both reuse tests below so both rows get filled
+
+    for (int dy = 0; dy < h; dy++)
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // same source row pair as the previous output row: both scratch rows are still valid
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // window moved down by one row: swap so the old rows1 becomes rows0, then interpolate only source row sy + 1
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows0_old;
+            const float* S1 = src.row(sy + 1);
+
+            const float* alphap = alpha; // restart at the first column's weights
+            float* rows1p = rows1;
+            int dx = 0;
+            for (; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4; // element index -> float offset (4 floats per element)
+                const float* S1p = S1 + sx;
+
+                __m128 _a0 = _mm_set1_ps(alphap[0]); // broadcast each weight to all 4 lanes
+                __m128 _a1 = _mm_set1_ps(alphap[1]);
+
+                __m128 _S10 = _mm_load_ps(S1p); // aligned load: assumes source rows are 16-byte aligned - TODO confirm Mat guarantees this
+                __m128 _S11 = _mm_load_ps(S1p + 4); // next element to the right
+                __m128 _rows1 = _mm_mul_ps(_S10, _a0);
+                _rows1 = _mm_comp_fmadd_ps(_S11, _a1, _rows1); // presumably _S11 * _a1 + _rows1; helper is defined outside this file
+                _mm_store_ps(rows1p + dx * 4, _rows1);
+
+                alphap += 2;
+            }
+        }
+        else
+        {
+            // any other change of sy: interpolate both source rows sy and sy + 1 horizontally
+            const float* S0 = src.row(sy);
+            const float* S1 = src.row(sy + 1);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            int dx = 0;
+            for (; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 4; // element index -> float offset
+                const float* S0p = S0 + sx;
+                const float* S1p = S1 + sx;
+
+                __m128 _a0 = _mm_set1_ps(alphap[0]);
+                __m128 _a1 = _mm_set1_ps(alphap[1]);
+
+                __m128 _S00 = _mm_load_ps(S0p);
+                __m128 _S01 = _mm_load_ps(S0p + 4);
+                __m128 _S10 = _mm_load_ps(S1p);
+                __m128 _S11 = _mm_load_ps(S1p + 4);
+                __m128 _rows0 = _mm_mul_ps(_S00, _a0); // same column weights applied to both rows
+                __m128 _rows1 = _mm_mul_ps(_S10, _a0);
+                _rows0 = _mm_comp_fmadd_ps(_S01, _a1, _rows0);
+                _rows1 = _mm_comp_fmadd_ps(_S11, _a1, _rows1);
+                _mm_store_ps(rows0p + dx * 4, _rows0);
+                _mm_store_ps(rows1p + dx * 4, _rows1);
+
+                alphap += 2;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vertical pass: blend the two scratch rows with this output row's weights beta[0] and beta[1]
+        __m128 _b0 = _mm_set1_ps(beta[0]);
+        __m128 _b1 = _mm_set1_ps(beta[1]);
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        float* Dp = dst.row(dy);
+
+        for (int dx = 0; dx < w; dx++)
+        { // one 4-float element per iteration, so no scalar tail is needed
+            __m128 _rows0 = _mm_load_ps(rows0p);
+            __m128 _rows1 = _mm_load_ps(rows1p);
+            __m128 _D = _mm_mul_ps(_rows0, _b0);
+            _D = _mm_comp_fmadd_ps(_rows1, _b1, _D);
+            _mm_store_ps(Dp, _D);
+
+            Dp += 4;
+            rows0p += 4;
+            rows1p += 4;
+        }
+
+        beta += 2; // advance to the next output row's pair of weights
+    }
+}
diff --git a/src/layer/x86/interp_bilinear_pack8.h b/src/layer/x86/interp_bilinear_pack8.h
new file mode 100644
index 000000000..5199d4790
--- /dev/null
+++ b/src/layer/x86/interp_bilinear_pack8.h
@@ -0,0 +1,123 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void resize_bilinear_image_pack8(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) // bilinear resize where every pixel is a group of 8 consecutive floats handled as one __m256
+{ // alpha/xofs: two weights and first source column per output column; beta/yofs: two weights and first source row per output row; output size is taken from dst
+    int w = dst.w;
+    int h = dst.h;
+
+    // two scratch rows holding w elements of 8 floats each; presumably the Mat arguments are (w, elemsize = 32 bytes, elempack = 8) - confirm against Mat
+    Mat rowsbuf0(w, (size_t)8 * 4u, 8);
+    Mat rowsbuf1(w, (size_t)8 * 4u, 8);
+    float* rows0 = rowsbuf0;
+    float* rows1 = rowsbuf1;
+
+    int prev_sy1 = -2; // sy of the previous output row; -2 makes a first sy >= 0 fail both reuse tests below so both rows get filled
+
+    for (int dy = 0; dy < h; dy++)
+    {
+        int sy = yofs[dy];
+
+        if (sy == prev_sy1)
+        {
+            // same source row pair as the previous output row: both scratch rows are still valid
+        }
+        else if (sy == prev_sy1 + 1)
+        {
+            // window moved down by one row: swap so the old rows1 becomes rows0, then interpolate only source row sy + 1
+            float* rows0_old = rows0;
+            rows0 = rows1;
+            rows1 = rows0_old;
+            const float* S1 = src.row(sy + 1);
+
+            const float* alphap = alpha; // restart at the first column's weights
+            float* rows1p = rows1;
+            int dx = 0;
+            for (; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 8; // element index -> float offset (8 floats per element)
+                const float* S1p = S1 + sx;
+
+                __m256 _a0 = _mm256_set1_ps(alphap[0]); // broadcast each weight to all 8 lanes
+                __m256 _a1 = _mm256_set1_ps(alphap[1]);
+
+                __m256 _S10 = _mm256_load_ps(S1p); // aligned load: assumes source rows are 32-byte aligned - TODO confirm Mat guarantees this
+                __m256 _S11 = _mm256_load_ps(S1p + 8); // next element to the right
+                __m256 _rows1 = _mm256_mul_ps(_S10, _a0);
+                _rows1 = _mm256_comp_fmadd_ps(_S11, _a1, _rows1); // presumably _S11 * _a1 + _rows1; helper is defined outside this file
+                _mm256_store_ps(rows1p + dx * 8, _rows1);
+
+                alphap += 2;
+            }
+        }
+        else
+        {
+            // any other change of sy: interpolate both source rows sy and sy + 1 horizontally
+            const float* S0 = src.row(sy);
+            const float* S1 = src.row(sy + 1);
+
+            const float* alphap = alpha;
+            float* rows0p = rows0;
+            float* rows1p = rows1;
+            int dx = 0;
+            for (; dx < w; dx++)
+            {
+                int sx = xofs[dx] * 8; // element index -> float offset
+                const float* S0p = S0 + sx;
+                const float* S1p = S1 + sx;
+
+                __m256 _a0 = _mm256_set1_ps(alphap[0]);
+                __m256 _a1 = _mm256_set1_ps(alphap[1]);
+
+                __m256 _S00 = _mm256_load_ps(S0p);
+                __m256 _S01 = _mm256_load_ps(S0p + 8);
+                __m256 _S10 = _mm256_load_ps(S1p);
+                __m256 _S11 = _mm256_load_ps(S1p + 8);
+                __m256 _rows0 = _mm256_mul_ps(_S00, _a0); // same column weights applied to both rows
+                __m256 _rows1 = _mm256_mul_ps(_S10, _a0);
+                _rows0 = _mm256_comp_fmadd_ps(_S01, _a1, _rows0);
+                _rows1 = _mm256_comp_fmadd_ps(_S11, _a1, _rows1);
+                _mm256_store_ps(rows0p + dx * 8, _rows0);
+                _mm256_store_ps(rows1p + dx * 8, _rows1);
+
+                alphap += 2;
+            }
+        }
+
+        prev_sy1 = sy;
+
+        // vertical pass: blend the two scratch rows with this output row's weights beta[0] and beta[1]
+        __m256 _b0 = _mm256_set1_ps(beta[0]);
+        __m256 _b1 = _mm256_set1_ps(beta[1]);
+
+        float* rows0p = rows0;
+        float* rows1p = rows1;
+        float* Dp = dst.row(dy);
+
+        for (int dx = 0; dx < w; dx++)
+        { // one 8-float element per iteration, so no scalar tail is needed
+            __m256 _rows0 = _mm256_load_ps(rows0p);
+            __m256 _rows1 = _mm256_load_ps(rows1p);
+            __m256 _D = _mm256_mul_ps(_rows0, _b0);
+            _D = _mm256_comp_fmadd_ps(_rows1, _b1, _D);
+            _mm256_store_ps(Dp, _D);
+
+            Dp += 8;
+            rows0p += 8;
+            rows1p += 8;
+        }
+
+        beta += 2; // advance to the next output row's pair of weights
+    }
+}
diff --git a/src/layer/x86/interp_x86.cpp b/src/layer/x86/interp_x86.cpp
new file mode 100644
index 000000000..e6a958819
--- /dev/null
+++ b/src/layer/x86/interp_x86.cpp
@@ -0,0 +1,690 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "interp_x86.h"
+
+#include <math.h>
+
+#if __SSE2__
+#include <emmintrin.h>
+#if __AVX__
+#include <immintrin.h>
+#endif // __AVX__
+#endif // __SSE2__
+
+#include "x86_usability.h"
+
+namespace ncnn {
+
+#include "interp_bicubic.h"
+#include "interp_bilinear.h"
+
+#if __SSE2__
+#include "interp_bicubic_pack4.h"
+#include "interp_bilinear_pack4.h"
+
+#if __AVX__
+#include "interp_bicubic_pack8.h"
+#include "interp_bilinear_pack8.h"
+#endif
+#endif
+
+Interp_x86::Interp_x86() // constructor: only sets the packing flag, and only on SSE2 builds
+{
+#if __SSE2__
+    support_packing = true; // forward() below has elempack == 4 branches (and elempack == 8 under __AVX__); presumably this flag lets the framework pass packed blobs - confirm against the Layer base class
+#endif // __SSE2__
+}
+
+int Interp_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& reference_blob = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    int h = bottom_blob.h;
+    int w = bottom_blob.w;
+    int channels = bottom_blob.c;
+    int dims = bottom_blob.dims;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    int outw = reference_blob.w;
+    int outh = reference_blob.h;
+
+    if (dims == 1)
+    {
+        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+#if __SSE2__
+#if __AVX__
+        if (elempack == 8)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < w; q++)
+            {
+                Mat top_blob_c = top_blob.channel(q);
+                __m256 _v = _mm256_load_ps((const float*)bottom_blob + q * 8);
+                top_blob_c.fill(_v);
+            }
+
+            return 0;
+        }
+#endif // __AVX__
+
+        if (elempack == 4)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < w; q++)
+            {
+                Mat top_blob_c = top_blob.channel(q);
+                __m128 _v = _mm_load_ps((const float*)bottom_blob + q * 4);
+                top_blob_c.fill(_v);
+            }
+
+            return 0;
+        }
+#endif // __SSE2__
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < w; q++)
+        {
+            Mat top_blob_c = top_blob.channel(q);
+            const float v = bottom_blob[q];
+            top_blob_c.fill(v);
+        }
+
+        return 0;
+    }
+
+    if (dims == 2)
+    {
+        if (outw == w)
+        {
+            top_blob = bottom_blob;
+            return 0;
+        }
+
+        top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+#if __SSE2__
+#if __AVX__
+        if (elempack == 8)
+        {
+            if (resize_type == 1) // nearest
+            {
+                const float ws = outw ? w / (float)outw : 1.f / width_scale;
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int y = 0; y < h; y++)
+                {
+                    const float* ptr = bottom_blob.row(y);
+                    float* outptr = top_blob.row(y);
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int in_x = std::min((int)(x * ws), (w - 1));
+
+                        __m256 _p = _mm256_load_ps(ptr + in_x * 8);
+                        _mm256_store_ps(outptr, _p);
+
+                        outptr += 8;
+                    }
+                }
+            }
+
+            if (resize_type == 2) // bilinear
+            {
+                int* buf = new int[outw + outw * 2];
+
+                int* xofs = buf;
+                float* alpha = (float*)(buf + outw);
+
+                linear_coeffs(w, outw, xofs, alpha, align_corner);
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int y = 0; y < h; y++)
+                {
+                    const float* ptr = bottom_blob.row(y);
+                    float* outptr = top_blob.row(y);
+                    const float* alphap = alpha;
+
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int sx = xofs[x] * 8;
+                        const float* Sp = ptr + sx;
+
+                        __m256 _a0 = _mm256_set1_ps(alphap[0]);
+                        __m256 _a1 = _mm256_set1_ps(alphap[1]);
+
+                        __m256 _S0 = _mm256_load_ps(Sp);
+                        __m256 _S1 = _mm256_load_ps(Sp + 8);
+                        __m256 _p = _mm256_mul_ps(_S0, _a0);
+                        _p = _mm256_comp_fmadd_ps(_S1, _a1, _p);
+                        _mm256_store_ps(outptr, _p);
+
+                        alphap += 2;
+                        outptr += 8;
+                    }
+                }
+
+                delete[] buf;
+            }
+
+            if (resize_type == 3) // bicubic
+            {
+                int* buf = new int[outw + outw * 4];
+
+                int* xofs = buf;
+                float* alpha = (float*)(buf + outw);
+
+                cubic_coeffs(w, outw, xofs, alpha, align_corner);
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int y = 0; y < h; y++)
+                {
+                    const float* ptr = bottom_blob.row(y);
+                    float* outptr = top_blob.row(y);
+                    const float* alphap = alpha;
+
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int sx = xofs[x] * 8;
+                        const float* Sp = ptr + sx;
+
+                        __m256 _a0 = _mm256_set1_ps(alphap[0]);
+                        __m256 _a1 = _mm256_set1_ps(alphap[1]);
+                        __m256 _a2 = _mm256_set1_ps(alphap[2]);
+                        __m256 _a3 = _mm256_set1_ps(alphap[3]);
+
+                        __m256 _S0 = _mm256_load_ps(Sp - 8);
+                        __m256 _S1 = _mm256_load_ps(Sp + 0);
+                        __m256 _S2 = _mm256_load_ps(Sp + 8);
+                        __m256 _S3 = _mm256_load_ps(Sp + 16);
+                        __m256 _p = _mm256_mul_ps(_S0, _a0);
+                        _p = _mm256_comp_fmadd_ps(_S1, _a1, _p);
+                        _p = _mm256_comp_fmadd_ps(_S2, _a2, _p);
+                        _p = _mm256_comp_fmadd_ps(_S3, _a3, _p);
+                        _mm256_store_ps(outptr, _p);
+
+                        alphap += 4;
+                        outptr += 8;
+                    }
+                }
+
+                delete[] buf;
+            }
+
+            return 0;
+        }
+#endif // __AVX__
+
+        if (elempack == 4)
+        {
+            if (resize_type == 1) // nearest
+            {
+                const float ws = outw ? w / (float)outw : 1.f / width_scale;
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int y = 0; y < h; y++)
+                {
+                    const float* ptr = bottom_blob.row(y);
+                    float* outptr = top_blob.row(y);
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int in_x = std::min((int)(x * ws), (w - 1));
+
+                        __m128 _p = _mm_load_ps(ptr + in_x * 4);
+                        _mm_store_ps(outptr, _p);
+
+                        outptr += 4;
+                    }
+                }
+            }
+
+            if (resize_type == 2) // bilinear
+            {
+                int* buf = new int[outw + outw * 2];
+
+                int* xofs = buf;
+                float* alpha = (float*)(buf + outw);
+
+                linear_coeffs(w, outw, xofs, alpha, align_corner);
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int y = 0; y < h; y++)
+                {
+                    const float* ptr = bottom_blob.row(y);
+                    float* outptr = top_blob.row(y);
+                    const float* alphap = alpha;
+
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int sx = xofs[x] * 4;
+                        const float* Sp = ptr + sx;
+
+                        __m128 _a0 = _mm_set1_ps(alphap[0]);
+                        __m128 _a1 = _mm_set1_ps(alphap[1]);
+
+                        __m128 _S0 = _mm_load_ps(Sp);
+                        __m128 _S1 = _mm_load_ps(Sp + 4);
+                        __m128 _p = _mm_mul_ps(_S0, _a0);
+                        _p = _mm_comp_fmadd_ps(_S1, _a1, _p);
+                        _mm_store_ps(outptr, _p);
+
+                        alphap += 2;
+                        outptr += 4;
+                    }
+                }
+
+                delete[] buf;
+            }
+
+            if (resize_type == 3) // bicubic
+            {
+                int* buf = new int[outw + outw * 4];
+
+                int* xofs = buf;
+                float* alpha = (float*)(buf + outw);
+
+                cubic_coeffs(w, outw, xofs, alpha, align_corner);
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int y = 0; y < h; y++)
+                {
+                    const float* ptr = bottom_blob.row(y);
+                    float* outptr = top_blob.row(y);
+                    const float* alphap = alpha;
+
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int sx = xofs[x] * 4;
+                        const float* Sp = ptr + sx;
+
+                        __m128 _a0 = _mm_set1_ps(alphap[0]);
+                        __m128 _a1 = _mm_set1_ps(alphap[1]);
+                        __m128 _a2 = _mm_set1_ps(alphap[2]);
+                        __m128 _a3 = _mm_set1_ps(alphap[3]);
+
+                        __m128 _S0 = _mm_load_ps(Sp - 4);
+                        __m128 _S1 = _mm_load_ps(Sp + 0);
+                        __m128 _S2 = _mm_load_ps(Sp + 4);
+                        __m128 _S3 = _mm_load_ps(Sp + 8);
+                        __m128 _p = _mm_mul_ps(_S0, _a0);
+                        _p = _mm_comp_fmadd_ps(_S1, _a1, _p);
+                        _p = _mm_comp_fmadd_ps(_S2, _a2, _p);
+                        _p = _mm_comp_fmadd_ps(_S3, _a3, _p);
+                        _mm_store_ps(outptr, _p);
+
+                        alphap += 4;
+                        outptr += 4;
+                    }
+                }
+
+                delete[] buf;
+            }
+
+            return 0;
+        }
+#endif // __SSE2__
+
+        if (resize_type == 1) // nearest
+        {
+            const float ws = outw ? w / (float)outw : 1.f / width_scale;
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int y = 0; y < h; y++)
+            {
+                const float* ptr = bottom_blob.row(y);
+                float* outptr = top_blob.row(y);
+                for (int x = 0; x < outw; x++)
+                {
+                    int in_x = std::min((int)(x * ws), (w - 1));
+                    *outptr++ = ptr[in_x];
+                }
+            }
+        }
+
+        if (resize_type == 2) // bilinear
+        {
+            int* buf = new int[outw + outw * 2];
+
+            int* xofs = buf;
+            float* alpha = (float*)(buf + outw);
+
+            linear_coeffs(w, outw, xofs, alpha, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int y = 0; y < h; y++)
+            {
+                const float* ptr = bottom_blob.row(y);
+                float* outptr = top_blob.row(y);
+                const float* alphap = alpha;
+
+                for (int x = 0; x < outw; x++)
+                {
+                    int sx = xofs[x];
+                    const float* Sp = ptr + sx;
+                    float a0 = alphap[0];
+                    float a1 = alphap[1];
+                    *outptr++ = Sp[0] * a0 + Sp[1] * a1;
+                    alphap += 2;
+                }
+            }
+
+            delete[] buf;
+        }
+
+        if (resize_type == 3) // bicubic
+        {
+            int* buf = new int[outw + outw * 4];
+
+            int* xofs = buf;
+            float* alpha = (float*)(buf + outw);
+
+            cubic_coeffs(w, outw, xofs, alpha, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int y = 0; y < h; y++)
+            {
+                const float* ptr = bottom_blob.row(y);
+                float* outptr = top_blob.row(y);
+                const float* alphap = alpha;
+
+                for (int x = 0; x < outw; x++)
+                {
+                    int sx = xofs[x];
+                    const float* Sp = ptr + sx;
+                    float a0 = alphap[0];
+                    float a1 = alphap[1];
+                    float a2 = alphap[2];
+                    float a3 = alphap[3];
+                    *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3;
+                    alphap += 4;
+                }
+            }
+
+            delete[] buf;
+        }
+
+        return 0;
+    }
+
+    if (outw == w && outh == h)
+    {
+        top_blob = bottom_blob;
+        return 0;
+    }
+
+    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+#if __SSE2__
+#if __AVX__
+    if (elempack == 8)
+    {
+        if (resize_type == 1) // nearest
+        {
+            const float hs = outh ? h / (float)outh : 1.f / height_scale;
+            const float ws = outw ? w / (float)outw : 1.f / width_scale;
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                for (int y = 0; y < outh; y++)
+                {
+                    int in_y = std::min((int)(y * hs), (h - 1));
+
+                    const float* ptr = src.row(in_y);
+                    float* outptr = dst.row(y);
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int in_x = std::min((int)(x * ws), (w - 1));
+
+                        __m256 _p = _mm256_load_ps(ptr + in_x * 8);
+                        _mm256_store_ps(outptr, _p);
+
+                        outptr += 8;
+                    }
+                }
+            }
+        }
+
+        if (resize_type == 2) // bilinear
+        {
+            int* buf = new int[outw + outh + outw * 2 + outh * 2];
+
+            int* xofs = buf;        //new int[outw];
+            int* yofs = buf + outw; //new int[outh];
+
+            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
+            float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
+
+            linear_coeffs(w, outw, xofs, alpha, align_corner);
+            linear_coeffs(h, outh, yofs, beta, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                resize_bilinear_image_pack8(src, dst, alpha, xofs, beta, yofs);
+            }
+
+            delete[] buf;
+        }
+
+        if (resize_type == 3) // bicubic
+        {
+            int* buf = new int[outw + outh + outw * 4 + outh * 4];
+
+            int* xofs = buf;        //new int[outw];
+            int* yofs = buf + outw; //new int[outh];
+
+            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
+            float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
+
+            cubic_coeffs(w, outw, xofs, alpha, align_corner);
+            cubic_coeffs(h, outh, yofs, beta, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                resize_bicubic_image_pack8(src, dst, alpha, xofs, beta, yofs);
+            }
+
+            delete[] buf;
+        }
+
+        return 0;
+    }
+#endif // __AVX__
+
+    if (elempack == 4)
+    {
+        if (resize_type == 1) // nearest
+        {
+            const float hs = outh ? h / (float)outh : 1.f / height_scale;
+            const float ws = outw ? w / (float)outw : 1.f / width_scale;
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                for (int y = 0; y < outh; y++)
+                {
+                    int in_y = std::min((int)(y * hs), (h - 1));
+
+                    const float* ptr = src.row(in_y);
+                    float* outptr = dst.row(y);
+                    for (int x = 0; x < outw; x++)
+                    {
+                        int in_x = std::min((int)(x * ws), (w - 1));
+
+                        __m128 _p = _mm_load_ps(ptr + in_x * 4);
+                        _mm_store_ps(outptr, _p);
+
+                        outptr += 4;
+                    }
+                }
+            }
+        }
+
+        if (resize_type == 2) // bilinear
+        {
+            int* buf = new int[outw + outh + outw * 2 + outh * 2];
+
+            int* xofs = buf;        //new int[outw];
+            int* yofs = buf + outw; //new int[outh];
+
+            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
+            float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
+
+            linear_coeffs(w, outw, xofs, alpha, align_corner);
+            linear_coeffs(h, outh, yofs, beta, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs);
+            }
+
+            delete[] buf;
+        }
+
+        if (resize_type == 3) // bicubic
+        {
+            int* buf = new int[outw + outh + outw * 4 + outh * 4];
+
+            int* xofs = buf;        //new int[outw];
+            int* yofs = buf + outw; //new int[outh];
+
+            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
+            float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
+
+            cubic_coeffs(w, outw, xofs, alpha, align_corner);
+            cubic_coeffs(h, outh, yofs, beta, align_corner);
+
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const Mat src = bottom_blob.channel(q);
+                Mat dst = top_blob.channel(q);
+
+                resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs);
+            }
+
+            delete[] buf;
+        }
+
+        return 0;
+    }
+#endif // __SSE2__
+
+    if (resize_type == 1) // nearest
+    {
+        const float hs = outh ? h / (float)outh : 1.f / height_scale;
+        const float ws = outw ? w / (float)outw : 1.f / width_scale;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            for (int y = 0; y < outh; y++)
+            {
+                int in_y = std::min((int)(y * hs), (h - 1));
+
+                const float* ptr = src.row(in_y);
+                float* outptr = dst.row(y);
+                for (int x = 0; x < outw; x++)
+                {
+                    int in_x = std::min((int)(x * ws), (w - 1));
+                    *outptr++ = ptr[in_x];
+                }
+            }
+        }
+    }
+
+    if (resize_type == 2) // bilinear
+    {
+        int* buf = new int[outw + outh + outw * 2 + outh * 2];
+
+        int* xofs = buf;        //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
+        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];
+
+        linear_coeffs(w, outw, xofs, alpha, align_corner);
+        linear_coeffs(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bilinear_image(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    if (resize_type == 3) // bicubic
+    {
+        int* buf = new int[outw + outh + outw * 4 + outh * 4];
+
+        int* xofs = buf;        //new int[outw];
+        int* yofs = buf + outw; //new int[outh];
+
+        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
+        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];
+
+        cubic_coeffs(w, outw, xofs, alpha, align_corner);
+        cubic_coeffs(h, outh, yofs, beta, align_corner);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const Mat src = bottom_blob.channel(q);
+            Mat dst = top_blob.channel(q);
+
+            resize_bicubic_image(src, dst, alpha, xofs, beta, yofs);
+        }
+
+        delete[] buf;
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/x86/interp_x86.h b/src/layer/x86/interp_x86.h
new file mode 100644
index 000000000..6f91b950e
--- /dev/null
+++ b/src/layer/x86/interp_x86.h
@@ -0,0 +1,32 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_INTERP_X86_H
+#define LAYER_INTERP_X86_H
+
+#include "interp.h"
+
+namespace ncnn {
+
+class Interp_x86 : virtual public Interp // subclass of Interp declared under src/layer/x86; inherits from Interp virtually
+{
+public:
+    Interp_x86(); // default constructor; its definition is not part of this header
+
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; // const forward taking input Mats, output Mats and an Option; presumably overrides Interp::forward and returns 0 on success -- confirm against interp.h
+};
+
+} // namespace ncnn
+
+#endif // LAYER_INTERP_X86_H