fix(dnn/arm): fix read invalid data in arm kernel

GitOrigin-RevId: f1c4cae667
5 years ago · 343335932a
--- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h
+++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s1.h
@@ -24,82 +24,75 @@ using namespace megdnn;
 using namespace arm_common;
 namespace {

 template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T,
          typename T2, typename T3, typename T4>
 template <int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w,
          typename T, typename T2, typename T3, typename T4>
 struct ShiftCalHelper {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight);
 };

 template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
          typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 2, 8, T, T2, T3, T4> {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                             \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane],      \
                                 src[(step + src_idx) % 8], lane); \
    c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane],      \
                                 src[(step + src_idx) % 8], lane);

        UNROLL_CALL_RAW(8, cb, 0);
        UNROLL_CALL_RAW(8, cb, 1);
        UNROLL_CALL_RAW(8, cb, 2);
        UNROLL_CALL_RAW(8, cb, 3);
 #undef cb
    }
 };
 template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
          typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 2, 4, T, T2, T3, T4> {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                             \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane],      \
                                 src[(step + src_idx) % 4], lane); \
    c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane],      \
                                 src[(step + src_idx) % 4], lane);

        UNROLL_CALL_RAW(4, cb, 0);
        UNROLL_CALL_RAW(4, cb, 1);
        UNROLL_CALL_RAW(4, cb, 2);
        UNROLL_CALL_RAW(4, cb, 3);
 #undef cb
    }
 template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T,
          typename T2, typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> {
    static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {}
 };
 template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
          typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 1, 8, T, T2, T3, T4> {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                        \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
                                 src[(step + src_idx) % 8], lane);

        UNROLL_CALL_RAW(8, cb, 0);
        UNROLL_CALL_RAW(8, cb, 1);
        UNROLL_CALL_RAW(8, cb, 2);
        UNROLL_CALL_RAW(8, cb, 3);
 #undef cb
    }
 };
 template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
          typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 1, 4, T, T2, T3, T4> {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                        \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
                                 src[(step + src_idx) % 4], lane);
 #define cb2(step, lane, ow_block)                                         \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane],             \
                                 src[(step + src_idx) % ow_block], lane); \
    c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane],             \
                                 src[(step + src_idx) % ow_block], lane);

        UNROLL_CALL_RAW(4, cb, 0);
        UNROLL_CALL_RAW(4, cb, 1);
        UNROLL_CALL_RAW(4, cb, 2);
        UNROLL_CALL_RAW(4, cb, 3);
 #define cb(step, lane, ow_block)                              \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
                                 src[(step + src_idx) % ow_block], lane);

 #define SHIFT_CAL_HELPER(ow_block, remain_w)                                 \
    template <int src_idx, int weight_idx, typename T, typename T2,          \
              typename T3, typename T4>                                      \
    struct ShiftCalHelper<src_idx, weight_idx, 2, ow_block, remain_w, T, T2, \
                          T3, T4> {                                          \
        static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {   \
            UNROLL_CALL_RAW(remain_w, cb2, 0, ow_block);                     \
            UNROLL_CALL_RAW(remain_w, cb2, 1, ow_block);                     \
            UNROLL_CALL_RAW(remain_w, cb2, 2, ow_block);                     \
            UNROLL_CALL_RAW(remain_w, cb2, 3, ow_block);                     \
        }                                                                    \
    };                                                                       \
    template <int src_idx, int weight_idx, typename T, typename T2,          \
              typename T3, typename T4>                                      \
    struct ShiftCalHelper<src_idx, weight_idx, 1, ow_block, remain_w, T, T2, \
                          T3, T4> {                                          \
        static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {   \
            UNROLL_CALL_RAW(remain_w, cb, 0, ow_block);                      \
            UNROLL_CALL_RAW(remain_w, cb, 1, ow_block);                      \
            UNROLL_CALL_RAW(remain_w, cb, 2, ow_block);                      \
            UNROLL_CALL_RAW(remain_w, cb, 3, ow_block);                      \
        }                                                                    \
    };

 SHIFT_CAL_HELPER(8, 1);
 SHIFT_CAL_HELPER(8, 2);
 SHIFT_CAL_HELPER(8, 3);
 SHIFT_CAL_HELPER(8, 4);
 SHIFT_CAL_HELPER(8, 5);
 SHIFT_CAL_HELPER(8, 6);
 SHIFT_CAL_HELPER(8, 7);
 SHIFT_CAL_HELPER(8, 8);

 SHIFT_CAL_HELPER(4, 1);
 SHIFT_CAL_HELPER(4, 2);
 SHIFT_CAL_HELPER(4, 3);
 SHIFT_CAL_HELPER(4, 4);

 #undef SHIFT_CAL_HELPER
 #undef cb
    }
 };
 #undef cb2

 template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T,
          typename T2, typename T3>
 template <int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w,
          typename T, typename T2, typename T3>
 MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) {
    ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, T, T2, T3, int>::impl(
            c, src, weight);
    ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, remain_w, T, T2, T3,
                   int>::impl(c, src, weight);
 };
 template <int oc>
 struct OCHelper {
@@ -151,7 +144,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -162,11 +155,11 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
                                                                 0);
                load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
                load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src_ptr += ld_src_iw;
                weight_ptr += ld_weight_fh;
            }
@@ -196,7 +189,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -207,15 +200,15 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
                                                                 0);
                load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
                load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step);
                load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<2, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src_ptr += ld_src_iw;
                weight_ptr += ld_weight_fh;
            }
@@ -244,7 +237,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -255,27 +248,27 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
                                                                 0);
                load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);

                src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
                load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);

                src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step);
                load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<2, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);

                src[2] = vld1q_f32(src_ptr + (ow_block + 2) * ic_step);
                load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<3, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight);

                src[3] = vld1q_f32(src_ptr + (ow_block + 3) * ic_step);
                load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<4, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src_ptr += ld_src_iw;
                weight_ptr += ld_weight_fh;
            }
@@ -305,7 +298,7 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -316,37 +309,37 @@ struct KerNeonXXs1Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
                                                                 0);
                load_helper<ic_step, 0, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);

                src[0] = vld1q_f32(src_ptr + (ow_block)*ic_step);
                load_helper<ic_step, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);

                src[1] = vld1q_f32(src_ptr + (ow_block + 1) * ic_step);
                load_helper<ic_step, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<2, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);

                src[2] = vld1q_f32(src_ptr + (ow_block + 2) * ic_step);
                load_helper<ic_step, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<3, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight);

                src[3] = vld1q_f32(src_ptr + (ow_block + 3) * ic_step);
                load_helper<ic_step, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<4, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<4, 0, c_dim, ow_block, remain_w>(c, src, weight);

                src[4] = vld1q_f32(src_ptr + (ow_block + 4) * ic_step);
                load_helper<ic_step, 5 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<5, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<5, 0, c_dim, ow_block, remain_w>(c, src, weight);

                src[5] = vld1q_f32(src_ptr + (ow_block + 5) * ic_step);
                load_helper<ic_step, 6 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<6, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<6, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src_ptr += ld_src_iw;
                weight_ptr += ld_weight_fh;
            }
--- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h
+++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw44_kern_common_s2.h
@@ -24,83 +24,77 @@ using namespace megdnn;
 using namespace arm_common;
 namespace {

 template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T,
          typename T2, typename T3, typename T4>
 template <int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w,
          typename T, typename T2, typename T3, typename T4>
 struct ShiftCalHelper {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight);
 };

 template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
          typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 2, 8, T, T2, T3, T4> {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                             \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane],      \
                                 src[(step + src_idx) % 8], lane); \
    c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane],      \
                                 src[(step + src_idx) % 8], lane);

        UNROLL_CALL_RAW(8, cb, 0);
        UNROLL_CALL_RAW(8, cb, 1);
        UNROLL_CALL_RAW(8, cb, 2);
        UNROLL_CALL_RAW(8, cb, 3);
 #undef cb
    }
 };
 template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
          typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 2, 4, T, T2, T3, T4> {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                             \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane],      \
                                 src[(step + src_idx) % 4], lane); \
    c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane],      \
                                 src[(step + src_idx) % 4], lane);

        UNROLL_CALL_RAW(4, cb, 0);
        UNROLL_CALL_RAW(4, cb, 1);
        UNROLL_CALL_RAW(4, cb, 2);
        UNROLL_CALL_RAW(4, cb, 3);
 #undef cb
    }
 template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T,
          typename T2, typename T3, typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, 0, T, T2, T3, T4> {
    static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {}
 };
 template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
          typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 1, 8, T, T2, T3, T4> {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                        \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
                                 src[(step + src_idx) % 8], lane);

        UNROLL_CALL_RAW(8, cb, 0);
        UNROLL_CALL_RAW(8, cb, 1);
        UNROLL_CALL_RAW(8, cb, 2);
        UNROLL_CALL_RAW(8, cb, 3);
 #undef cb
    }
 };
 template <int src_idx, int weight_idx, typename T, typename T2, typename T3,
          typename T4>
 struct ShiftCalHelper<src_idx, weight_idx, 1, 4, T, T2, T3, T4> {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step, lane)                                        \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
                                 src[(step + src_idx) % 4], lane);
 #define cb2(step, lane, ow_block)                                         \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane],             \
                                 src[(step + src_idx) % ow_block], lane); \
    c[1][step] = vfmaq_laneq_f32(c[1][step], weight[1][lane],             \
                                 src[(step + src_idx) % ow_block], lane);

        UNROLL_CALL_RAW(4, cb, 0);
        UNROLL_CALL_RAW(4, cb, 1);
        UNROLL_CALL_RAW(4, cb, 2);
        UNROLL_CALL_RAW(4, cb, 3);
 #define cb(step, lane, ow_block)                              \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][lane], \
                                 src[(step + src_idx) % ow_block], lane);

 #define SHIFT_CAL_HELPER(ow_block, remain_w)                                 \
    template <int src_idx, int weight_idx, typename T, typename T2,          \
              typename T3, typename T4>                                      \
    struct ShiftCalHelper<src_idx, weight_idx, 2, ow_block, remain_w, T, T2, \
                          T3, T4> {                                          \
        static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {   \
            UNROLL_CALL_RAW(remain_w, cb2, 0, ow_block);                     \
            UNROLL_CALL_RAW(remain_w, cb2, 1, ow_block);                     \
            UNROLL_CALL_RAW(remain_w, cb2, 2, ow_block);                     \
            UNROLL_CALL_RAW(remain_w, cb2, 3, ow_block);                     \
        }                                                                    \
    };                                                                       \
    template <int src_idx, int weight_idx, typename T, typename T2,          \
              typename T3, typename T4>                                      \
    struct ShiftCalHelper<src_idx, weight_idx, 1, ow_block, remain_w, T, T2, \
                          T3, T4> {                                          \
        static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {   \
            UNROLL_CALL_RAW(remain_w, cb, 0, ow_block);                      \
            UNROLL_CALL_RAW(remain_w, cb, 1, ow_block);                      \
            UNROLL_CALL_RAW(remain_w, cb, 2, ow_block);                      \
            UNROLL_CALL_RAW(remain_w, cb, 3, ow_block);                      \
        }                                                                    \
    };

 SHIFT_CAL_HELPER(8, 1);
 SHIFT_CAL_HELPER(8, 2);
 SHIFT_CAL_HELPER(8, 3);
 SHIFT_CAL_HELPER(8, 4);
 SHIFT_CAL_HELPER(8, 5);
 SHIFT_CAL_HELPER(8, 6);
 SHIFT_CAL_HELPER(8, 7);
 SHIFT_CAL_HELPER(8, 8);

 SHIFT_CAL_HELPER(4, 1);
 SHIFT_CAL_HELPER(4, 2);
 SHIFT_CAL_HELPER(4, 3);
 SHIFT_CAL_HELPER(4, 4);

 #undef SHIFT_CAL_HELPER
 #undef cb
    }
 };
 #undef cb2

 template <int src_idx, int weight_idx, int c_dim, int ow_block, typename T,
          typename T2, typename T3>
 template <int src_idx, int weight_idx, int c_dim, int ow_block, int remain_w,
          typename T, typename T2, typename T3>
 MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) {
    ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, T, T2, T3, int>::impl(
            c, src, weight);
    ShiftCalHelper<src_idx, weight_idx, c_dim, ow_block, remain_w, T, T2, T3,
                   int>::impl(c, src, weight);
 };

 template <int oc>
 struct OCHelper {
 public:
@@ -151,7 +145,7 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -163,13 +157,13 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
            load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                         ld_weight_oc);
            cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);

            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
                                                             0);
            load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
            src_ptr += ld_src_iw;
            src_ptr_odd += ld_src_iw;
            weight_ptr += ld_weight_fh;
@@ -177,13 +171,13 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 2, oc_block, ow_block> {
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
            load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                         ld_weight_oc);
            cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);

            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
                                                             0);
            load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
            src_ptr += ld_src_iw;
            src_ptr_odd += ld_src_iw;
            weight_ptr += ld_weight_fh;
@@ -213,7 +207,7 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
            const float* src_ptr_odd = src_ptr_odd_origin + ic_idx * ld_src_ic;
@@ -224,18 +218,18 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
            load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                         ld_weight_oc);
            cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);

            src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
            load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);

            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
                                                             0);
            load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
            src_ptr += ld_src_iw;
            src_ptr_odd += ld_src_iw;
            weight_ptr += ld_weight_fh;
@@ -243,17 +237,17 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
            load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                         ld_weight_oc);
            cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
            src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
            load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);

            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
                                                             0);
            load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
            src_ptr += ld_src_iw;
            src_ptr_odd += ld_src_iw;
            weight_ptr += ld_weight_fh;
@@ -261,18 +255,18 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 3, oc_block, ow_block> {
            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr, 0);
            load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                         ld_weight_oc);
            cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
            src[0] = vld1q_f32(src_ptr + ow_block * simd_len);

            load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);

            load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(src, src_ptr_odd,
                                                             0);
            load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
            cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
            src_ptr += ld_src_iw;
            src_ptr_odd += ld_src_iw;
            weight_ptr += ld_weight_fh;
@@ -302,7 +296,7 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -316,25 +310,25 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 5, oc_block, ow_block> {
                                                                 0);
                load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                             ld_weight_oc);
                cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
                load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src[1] = vld1q_f32(src_ptr + (ow_block + 1) * simd_len);
                load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<2, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
                // odd element
                load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(
                        src, src_ptr_odd, 0);
                load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src[0] = vld1q_f32(src_ptr_odd + ow_block * simd_len);
                load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);

                src_ptr += ld_src_iw;
                src_ptr_odd += ld_src_iw;
@@ -371,7 +365,7 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
        const int ld_src_iw = iw * oc_step;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][ow_block];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            const float* src_ptr = src_ptr_origin + ic_idx * ld_src_ic;
@@ -385,33 +379,33 @@ struct KerNeonXXs2Nchw44FP32<bias_mode, Op, remain_w, 7, oc_block, ow_block> {
                                                                 0);
                load_helper<4, 0, oc_step, c_dim, Vld1q_f32>(weight, weight_ptr,
                                                             ld_weight_oc);
                cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src[0] = vld1q_f32(src_ptr + ow_block * simd_len);
                load_helper<4, 2 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src[1] = vld1q_f32(src_ptr + (ow_block + 1) * simd_len);
                load_helper<4, 4 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<2, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src[2] = vld1q_f32(src_ptr + (ow_block + 2) * simd_len);
                load_helper<4, 6 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<3, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<3, 0, c_dim, ow_block, remain_w>(c, src, weight);
                // odd element
                load_helper<ow_block, 0, simd_len, 0, Vld1q_f32>(
                        src, src_ptr_odd, 0);
                load_helper<4, 1 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<0, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<0, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src[0] = vld1q_f32(src_ptr_odd + ow_block * simd_len);
                load_helper<4, 3 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<1, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<1, 0, c_dim, ow_block, remain_w>(c, src, weight);
                src[1] = vld1q_f32(src_ptr_odd + (ow_block + 1) * simd_len);
                load_helper<4, 5 * ld_weight, oc_step, c_dim, Vld1q_f32>(
                        weight, weight_ptr, ld_weight_oc);
                cal_helper<2, 0, c_dim, ow_block>(c, src, weight);
                cal_helper<2, 0, c_dim, ow_block, remain_w>(c, src, weight);

                src_ptr += ld_src_iw;
                src_ptr_odd += ld_src_iw;
--- a/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h
+++ b/dnn/src/arm_common/conv_bias/fp32/direct_kernels/f32_direct_nchw_nchw44_kern_common.h
@@ -39,16 +39,18 @@ namespace {
 *\tparam T2 is type of src regs
 *\tparam T3 is type of weight regs
 */
 template <int src_idx, int weight_idx, int c_dim, int stride, typename T,
          typename T2, typename T3>
 template <int src_idx, int weight_idx, int c_dim, int stride, int remain_w,
          typename T, typename T2, typename T3>
 struct ShiftCalHelper {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight);
 };

 template <int src_idx, int weight_idx, int stride, typename T, typename T2,
          typename T3>
 struct ShiftCalHelper<src_idx, weight_idx, 2, stride, T, T2, T3> {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 template <int src_idx, int weight_idx, int c_dim, int stride, typename T,
          typename T2, typename T3>
 struct ShiftCalHelper<src_idx, weight_idx, c_dim, stride, 0, T, T2, T3> {
    static MEGDNN_ALWAYS_INLINE void impl(T&, T2&, T3&) {}
 };

 #define cb(step)                                                     \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][weight_idx],  \
                                 src[(step * stride + src_idx) / 4], \
@@ -57,29 +59,47 @@ struct ShiftCalHelper<src_idx, weight_idx, 2, stride, T, T2, T3> {
                                 src[(step * stride + src_idx) / 4], \
                                 (step * stride + src_idx) % 4);

        UNROLL_CALL_RAW(8, cb);
 #undef cb
    }
 };
 template <int src_idx, int weight_idx, int stride, typename T, typename T2,
          typename T3>
 struct ShiftCalHelper<src_idx, weight_idx, 1, stride, T, T2, T3> {
    static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {
 #define cb(step)                                                     \
 #define cb2(step)                                                    \
    c[0][step] = vfmaq_laneq_f32(c[0][step], weight[0][weight_idx],  \
                                 src[(step * stride + src_idx) / 4], \
                                 (step * stride + src_idx) % 4);

        UNROLL_CALL_RAW(8, cb);
 #define SHIFT_CAL_HELPER(ow_remain)                                         \
    template <int src_idx, int weight_idx, int stride, typename T,          \
              typename T2, typename T3>                                     \
    struct ShiftCalHelper<src_idx, weight_idx, 2, stride, ow_remain, T, T2, \
                          T3> {                                             \
        static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {  \
            UNROLL_CALL_RAW(ow_remain, cb);                                 \
        }                                                                   \
    };                                                                      \
    template <int src_idx, int weight_idx, int stride, typename T,          \
              typename T2, typename T3>                                     \
    struct ShiftCalHelper<src_idx, weight_idx, 1, stride, ow_remain, T, T2, \
                          T3> {                                             \
        static MEGDNN_ALWAYS_INLINE void impl(T& c, T2& src, T3& weight) {  \
            UNROLL_CALL_RAW(ow_remain, cb2);                                \
        }                                                                   \
    };

 SHIFT_CAL_HELPER(1)
 SHIFT_CAL_HELPER(2)
 SHIFT_CAL_HELPER(3)
 SHIFT_CAL_HELPER(4)
 SHIFT_CAL_HELPER(5)
 SHIFT_CAL_HELPER(6)
 SHIFT_CAL_HELPER(7)
 SHIFT_CAL_HELPER(8)

 #undef SHIFT_CAL_HELPER
 #undef cb
    }
 };
 #undef cb2

 template <int src_idx, int weight_idx, int c_dim, int stride, typename T,
          typename T2, typename T3>
 template <int src_idx, int weight_idx, int c_dim, int stride, int remain_w,
          typename T, typename T2, typename T3>
 MEGDNN_ALWAYS_INLINE void cal_helper(T& c, T2& src, T3& weight) {
    ShiftCalHelper<src_idx, weight_idx, c_dim, stride, T, T2, T3>::impl(c, src,
                                                                        weight);
    ShiftCalHelper<src_idx, weight_idx, c_dim, stride, remain_w, T, T2,
                   T3>::impl(c, src, weight);
 };
 enum CpuTag {
    DEFAULT_CPU_TAG = 0,
@@ -134,7 +154,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 7, oc_block, stride,
        const int ld_src_ic = ih * iw;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            float32x4_t src[src_reg_size];
@@ -145,13 +165,13 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 7, oc_block, stride,
            src, src_ptr + step * iw, 0);                            \
    load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(          \
            weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
    cal_helper<0, 0, c_dim, stride>(c, src, weight);                 \
    cal_helper<1, 1, c_dim, stride>(c, src, weight);                 \
    cal_helper<2, 2, c_dim, stride>(c, src, weight);                 \
    cal_helper<3, 3, c_dim, stride>(c, src, weight);                 \
    cal_helper<4, 4, c_dim, stride>(c, src, weight);                 \
    cal_helper<5, 5, c_dim, stride>(c, src, weight);                 \
    cal_helper<6, 6, c_dim, stride>(c, src, weight);
    cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<5, 5, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<6, 6, c_dim, stride, remain_w>(c, src, weight);

            UNROLL_CALL_RAW(7, KERNEL_CB)
 #undef KERNEL_CB
@@ -185,7 +205,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 5, oc_block, stride,
        const int ld_src_ic = ih * iw;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            float32x4_t src[src_reg_size];
@@ -196,11 +216,11 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 5, oc_block, stride,
            src, src_ptr + step * iw, 0);                            \
    load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(          \
            weight, weight_ptr + step * ld_weight_fw, ld_weight_oc); \
    cal_helper<0, 0, c_dim, stride>(c, src, weight);                 \
    cal_helper<1, 1, c_dim, stride>(c, src, weight);                 \
    cal_helper<2, 2, c_dim, stride>(c, src, weight);                 \
    cal_helper<3, 3, c_dim, stride>(c, src, weight);                 \
    cal_helper<4, 4, c_dim, stride>(c, src, weight);
    cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<3, 3, c_dim, stride, remain_w>(c, src, weight);       \
    cal_helper<4, 4, c_dim, stride, remain_w>(c, src, weight);
            UNROLL_CALL_RAW(5, KERNEL_CB)
 #undef KERNEL_CB

@@ -233,7 +253,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride,
        const int ld_src_ic = ih * iw;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            float32x4_t src[src_reg_size];
@@ -243,27 +263,27 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 3, oc_block, stride,
                                                                 0);
            load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, stride>(c, src, weight);
            cal_helper<1, 1, c_dim, stride>(c, src, weight);
            cal_helper<2, 2, c_dim, stride>(c, src, weight);
            cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
            cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);
            cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);

            // row 1
            load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(
                    src, src_ptr + iw, 0);
            load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc);
            cal_helper<0, 0, c_dim, stride>(c, src, weight);
            cal_helper<1, 1, c_dim, stride>(c, src, weight);
            cal_helper<2, 2, c_dim, stride>(c, src, weight);
            cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
            cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);
            cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);

            // row 2
            load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(
                    src, src_ptr + 2 * iw, 0);
            load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr + 2 * ld_weight_fw, ld_weight_oc);
            cal_helper<0, 0, c_dim, stride>(c, src, weight);
            cal_helper<1, 1, c_dim, stride>(c, src, weight);
            cal_helper<2, 2, c_dim, stride>(c, src, weight);
            cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
            cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);
            cal_helper<2, 2, c_dim, stride, remain_w>(c, src, weight);

            src_ptr += ld_src_ic;
            weight_ptr += ld_weight_ic;
@@ -634,7 +654,7 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block, stride,
        const int ld_src_ic = ih * iw;
        constexpr int c_dim = OCHelper<oc_block>::val;
        float32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            float32x4_t src[src_reg_size];
@@ -644,16 +664,16 @@ struct KerNeonXXs2NchwNchw44FP32<bias_mode, Op, remain_w, 2, oc_block, stride,
                                                                 0);
            load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr, ld_weight_oc);
            cal_helper<0, 0, c_dim, stride>(c, src, weight);
            cal_helper<1, 1, c_dim, stride>(c, src, weight);
            cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
            cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);

            // row 1
            load_helper<src_reg_size, 0, simd_len, 0, Vld1q_f32>(
                    src, src_ptr + iw, 0);
            load_helper<filter_size, 0, oc_step, c_dim, Vld1q_f32>(
                    weight, weight_ptr + 1 * ld_weight_fw, ld_weight_oc);
            cal_helper<0, 0, c_dim, stride>(c, src, weight);
            cal_helper<1, 1, c_dim, stride>(c, src, weight);
            cal_helper<0, 0, c_dim, stride, remain_w>(c, src, weight);
            cal_helper<1, 1, c_dim, stride, remain_w>(c, src, weight);

            src_ptr += ld_src_ic;
            weight_ptr += ld_weight_ic;
--- a/dnn/src/arm_common/conv_bias/fp32/do_conv_stride1.cpp
+++ b/dnn/src/arm_common/conv_bias/fp32/do_conv_stride1.cpp
@@ -6,14 +6,15 @@
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include <algorithm>

 #include "src/arm_common/conv_bias/fp32/do_conv_stride1.h"
 #include "src/arm_common/simd_macro/neon_helper.h"
 #include "src/arm_common/conv_bias/postprocess_helper.h"
 #include "src/arm_common/simd_macro/neon_helper.h"

 #include "midout.h"

@@ -27,10 +28,9 @@ using namespace conv_stride1;
 using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;
 using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;


 void conv_stride1::do_conv_2x2_stride1(const float* src, const float* filter, float* dst,
                         size_t IH, size_t IW, size_t OH, size_t OW,
                         size_t IC) {
 void conv_stride1::do_conv_2x2_stride1(const float* src, const float* filter,
                                       float* dst, size_t IH, size_t IW,
                                       size_t OH, size_t OW, size_t IC) {
    const size_t tail_step = IW - OW;
    //! unroll of 2
    size_t ic = 0;
@@ -143,9 +143,9 @@ void conv_stride1::do_conv_2x2_stride1(const float* src, const float* filter, fl
    }
 }

 void conv_stride1::do_conv_3x3_stride1(const float* src, const float* filter, float* dst,
                         size_t IH, size_t IW, size_t OH, size_t OW,
                         size_t IC) {
 void conv_stride1::do_conv_3x3_stride1(const float* src, const float* filter,
                                       float* dst, size_t IH, size_t IW,
                                       size_t OH, size_t OW, size_t IC) {
    const size_t tail_step = IW - OW;

    rep(ic, IC) {
@@ -193,7 +193,7 @@ void conv_stride1::do_conv_3x3_stride1(const float* src, const float* filter, fl
                MEGDNN_SIMD_TYPE _r22 = MEGDNN_SIMD_EXT(_r20, _r20n, 2);

                MEGDNN_SIMD_TYPE _r30 = MEGDNN_SIMD_LOADU(r3);
                MEGDNN_SIMD_TYPE _r30n = MEGDNN_SIMD_LOADU(r3 + 4);
                MEGDNN_SIMD_TYPE _r30n = MEGDNN_SIMD_LOADU_2(r3 + 4);
                MEGDNN_SIMD_TYPE _r31 = MEGDNN_SIMD_EXT(_r30, _r30n, 1);
                MEGDNN_SIMD_TYPE _r32 = MEGDNN_SIMD_EXT(_r30, _r30n, 2);

@@ -290,9 +290,9 @@ void conv_stride1::do_conv_3x3_stride1(const float* src, const float* filter, fl
    }
 }

 void conv_stride1::do_conv_5x5_stride1(const float* src, const float* filter, float* dst,
                         size_t IH, size_t IW, size_t OH, size_t OW,
                         size_t IC) {
 void conv_stride1::do_conv_5x5_stride1(const float* src, const float* filter,
                                       float* dst, size_t IH, size_t IW,
                                       size_t OH, size_t OW, size_t IC) {
    const size_t tail_step = IW - OW;

    rep(ic, IC) {
@@ -530,9 +530,9 @@ void conv_stride1::do_conv_5x5_stride1(const float* src, const float* filter, fl
    }
 }

 void conv_stride1::do_conv_7x7_stride1(const float* src, const float* filter, float* dst,
                         size_t IH, size_t IW, size_t OH, size_t OW,
                         size_t IC) {
 void conv_stride1::do_conv_7x7_stride1(const float* src, const float* filter,
                                       float* dst, size_t IH, size_t IW,
                                       size_t OH, size_t OW, size_t IC) {
    const size_t tail_step = IW - OW;

    rep(ic, IC) {
@@ -688,7 +688,7 @@ void conv_stride1::do_conv_7x7_stride1(const float* src, const float* filter, fl
                _sum = MEGDNN_SIMD_FMA_LANE(_sum, _r56, _k39404142, 2);

                MEGDNN_SIMD_TYPE _k42434445 = MEGDNN_SIMD_LOADU(k6);
                MEGDNN_SIMD_TYPE _k46474849 = MEGDNN_SIMD_LOADU(k6 + 4);
                MEGDNN_SIMD_TYPE _k46474849 = MEGDNN_SIMD_LOADU_3(k6 + 4);

                MEGDNN_SIMD_TYPE _r60 = MEGDNN_SIMD_LOADU(r6);
                MEGDNN_SIMD_TYPE _r64 = MEGDNN_SIMD_LOADU(r6 + 4);
--- a/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp
+++ b/dnn/src/arm_common/conv_bias/fp32/f32_direct_nchw44_algo.cpp
@@ -126,7 +126,8 @@ static void do_conv_kern(const WorkspaceBundle& bundle,
                                    ? oh_idx * oh_block * ow * pack_c
                                    : oc_idx;
    const float* bptr =
            kern_param.bias<dt_float32>(batch_id, group_id) + bias_offset;
            kern_param.bias<dt_float32>(batch_id, group_id, oc_idx, 1, pack_c) +
            bias_offset;

    Op op;
    conv_bias::conv_direct_fp32_nchw44<bias_mode, Op, filter, stride>(
--- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw_nchw44_s1.cpp
+++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw_nchw44_s1.cpp
@@ -69,7 +69,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 2, oc_block, ow_block,
        constexpr int c_dim = OCHelper<oc_block>::val;

        int32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            int8x16_t src[src_reg];
            int8x16_t weight[c_dim][weight_reg];
@@ -117,7 +117,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 3, oc_block, ow_block,
        constexpr int c_dim = OCHelper<oc_block>::val;

        int32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            int8x16_t src[src_reg];
            int8x16_t weight[c_dim][weight_reg];
@@ -171,7 +171,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 5, oc_block, ow_block,
        constexpr int c_dim = OCHelper<oc_block>::val;

        int32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            int8x16_t src[src_reg];
@@ -220,7 +220,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 7, oc_block, ow_block,
        constexpr int c_dim = OCHelper<oc_block>::val;

        int32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            int8x16_t src[src_reg];
--- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw_nchw44_s2.cpp
+++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/dot_direct_nchw_nchw44_s2.cpp
@@ -80,7 +80,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 2, oc_block, ow_block,
        constexpr int c_dim = OCHelper<oc_block>::val;

        int32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            int8x16_t src[2][src_reg];
            int8x16_t weight[c_dim][weight_reg];
@@ -131,7 +131,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 3, oc_block, ow_block,
        constexpr int c_dim = OCHelper<oc_block>::val;

        int32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);
        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            int8x16_t src[2][src_reg];
            int8x16_t weight[c_dim][weight_reg];
@@ -189,7 +189,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 5, oc_block, ow_block,
        constexpr int c_dim = OCHelper<oc_block>::val;

        int32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            int8x16_t src[2][src_reg];
@@ -244,7 +244,7 @@ struct KerNeonDotXXs2Nchw44Int8<bias_mode, Op, remain_w, 7, oc_block, ow_block,
        constexpr int c_dim = OCHelper<oc_block>::val;

        int32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, ow_block>(c, bias_ptr, ld_bias);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, ld_bias);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += ic_step) {
            int8x16_t src[2][src_reg];
--- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw44_s1.cpp
+++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw44_s1.cpp
@@ -45,7 +45,7 @@ static void ker_neon_dirctconv_2x2s1_oc8_ow8(const int8_t* src_ptr,
    int8x16_t src[8 + 1];
    int16x8_t temp_c[4];

    init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
    init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

    for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
        for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -135,7 +135,7 @@ static void ker_neon_dirctconv_2x2s1_oc4_ow8(const int8_t* src_ptr,
    int8x16_t weight[1][2];
    int8x16_t src[8 + 1];
    int16x8_t temp_c[2];
    init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
    init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

    for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
        for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -224,7 +224,7 @@ struct KerNeonDirectStride1Int8<bias_mode, Op, remain_w, 3, c_dim, DstType> {
        int8x16_t weight[3];
        int8x16_t src[8 + 2];
        int16x8_t temp_c[2];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -306,7 +306,7 @@ struct KerNeonDirectStride1Int8<bias_mode, Op, remain_w, 5, c_dim, DstType> {
        int8x16_t weight[5];
        int8x16_t src[8 + 2];
        int16x8_t temp_c[2];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -409,7 +409,7 @@ struct KerNeonDirectStride1Int8<bias_mode, Op, remain_w, 7, c_dim, DstType> {
        int8x16_t weight[7];
        int8x16_t src[8 + 2];
        int16x8_t temp_c[2];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -569,7 +569,7 @@ void conv_direct_stride1_2x2_int8_nchw44(const int8_t* src,
                        (oh_idx * iw + ow_idx) * ic_step * pack_iw_len;
                const size_t dst_offset =
                        oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step;
                ker_neon_dirctconv_2x2s1_oc8_ow8<bias_mode, Op, 0, filter_size,
                ker_neon_dirctconv_2x2s1_oc8_ow8<bias_mode, Op, ow_step, filter_size,
                                                 2, DstType>(
                        src + src_offset, filter + weight_offset, bias + oc_idx,
                        dst + dst_offset, ic, ih, iw, ld_oc, op);
@@ -594,7 +594,7 @@ void conv_direct_stride1_2x2_int8_nchw44(const int8_t* src,
                        (oh_idx * iw + ow_idx) * ic_step * pack_iw_len;
                const size_t dst_offset =
                        oc_idx * img_stride + (oh_idx * ow + ow_idx) * oc_step;
                ker_neon_dirctconv_2x2s1_oc4_ow8<bias_mode, Op, 0, filter_size,
                ker_neon_dirctconv_2x2s1_oc4_ow8<bias_mode, Op, ow_step, filter_size,
                                                 1, DstType>(
                        src + src_offset, filter + weight_offset, bias + oc_idx,
                        dst + dst_offset, ic, ih, iw, ld_oc, op);
--- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw44_s2.cpp
+++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw44_s2.cpp
@@ -54,7 +54,7 @@ static void ker_neon_dirctconv_2x2s2_oc8_ow8(const int8_t* src_ptr,
    int8x16_t src[8 + 1];
    int16x8_t temp_c[4];

    init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
    init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

    for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
        for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -151,7 +151,7 @@ static void ker_neon_dirctconv_2x2s2_oc4_ow8(const int8_t* src_ptr,
    int8x16_t weight[2];
    int8x16_t src[8 + 1];
    int16x8_t temp_c[2];
    init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
    init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

    for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
        for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -239,7 +239,7 @@ struct KerNeonDirectStride2Int8<bias_mode, Op, remain_w, 3, c_dim, DstType> {
        int8x16_t weight[3];
        int8x16_t src[8 + 2];
        int16x8_t temp_c[4];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
@@ -327,7 +327,7 @@ struct KerNeonDirectStride2Int8<bias_mode, Op, remain_w, 5, c_dim, DstType> {
        int8x16_t weight[5];
        int8x16_t src[8 + 2];
        int16x8_t temp_c[4];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);
        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
                const int8_t* src_ic_0_3 = src_ptr + ic_idx * ic_stride +
@@ -435,7 +435,7 @@ struct KerNeonDirectStride2Int8<bias_mode, Op, remain_w, 7, c_dim, DstType> {
        int8x16_t weight[7];
        int8x16_t src[8 + 2];
        int16x8_t temp_c[4];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);
        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            for (int fh_idx = 0; fh_idx < fh; ++fh_idx) {
                const int8_t* src_ic_0_3 = src_ptr + ic_idx * ic_stride +
--- a/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp
+++ b/dnn/src/arm_common/conv_bias/int8/direct_kernels/int8_direct_nchw_nchw44_s1.cpp
@@ -131,7 +131,7 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 2, oc_block, 1> {
        const int ld_weight_oc = oc_step * filter_height * filter_width * ic;
        constexpr int c_dim = OCHelper<oc_block>::val;
        int32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride;
@@ -178,7 +178,7 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 3, oc_block, 1> {
        const int ld_weight_oc = oc_step * filter_height * filter_width * ic;
        constexpr int c_dim = OCHelper<oc_block>::val;
        int32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride;
@@ -232,7 +232,7 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 5, oc_block, 1> {
        const int ld_weight_oc = oc_step * filter_height * filter_width * ic;
        constexpr int c_dim = OCHelper<oc_block>::val;
        int32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride;
@@ -279,7 +279,7 @@ struct KerNeonXXs2NchwNchw44<bias_mode, Op, remain_w, 7, oc_block, 1> {
        const int ld_weight_oc = oc_step * filter_height * filter_width * ic;
        constexpr int c_dim = OCHelper<oc_block>::val;
        int32x4_t c[c_dim][8];
        init_ocx_ow8<c_dim, bias_mode, 8>(c, bias_ptr, oc_step);
        init_ocx_ow8<c_dim, bias_mode, remain_w>(c, bias_ptr, oc_step);

        for (int ic_idx = 0; ic_idx < ic; ic_idx += loop_ic_step) {
            const int8_t* nchw_src_ptr = src_ptr + ic_idx * ic_stride;
--- a/dnn/src/arm_common/conv_bias/intrinsic_helper.h
+++ b/dnn/src/arm_common/conv_bias/intrinsic_helper.h
@@ -643,128 +643,95 @@ __ai int32x4_t neon_vld1q(const int* ptr) {
 __ai int16x8_t neon_vld1q(const int16_t* ptr) {
    return vld1q_s16(ptr);
 }

 template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2>
 template <typename T>
 struct NeonLdqSimd;
 template <>
 struct NeonLdqSimd<float> {
    static constexpr int simd_len = 4;
 };
 template <>
 struct NeonLdqSimd<int> {
    static constexpr int simd_len = 4;
 };
 template <>
 struct NeonLdqSimd<int16_t> {
    static constexpr int simd_len = 8;
 };
 template <int c_dim, BiasMode bias_mode, int ow_remain, typename T, typename T2>
 struct InitOcxOw8 {
    static __ai void impl(T& c, const T2* bias_ptr, int oc_step);
 };
 template <typename T, typename T2>
 struct InitOcxOw8<2, BiasMode::NO_BIAS, 8, T, T2> {
    static __ai void impl(T& c, const T2*, int) {
 #define BAIS_INIT(step)                            \
    c[0][step] = neon_vdupq_n(static_cast<T2>(0)); \
    c[1][step] = neon_vdupq_n(static_cast<T2>(0));
        UNROLL_CALL_RAW(8, BAIS_INIT);
 #undef BAIS_INIT
    }
 template <int c_dim, BiasMode bias_mode, typename T, typename T2>
 struct InitOcxOw8<c_dim, bias_mode, 0, T, T2> {
    static __ai void impl(T&, const T2*, int) {}
 };
 template <typename T, typename T2>
 struct InitOcxOw8<2, BiasMode::NO_BIAS, 4, T, T2> {
    static __ai void impl(T& c, const T2*, int) {
 #define BAIS_INIT(step)                            \

 #define BAIS_INIT_NO_BIAS_C2(step)                 \
    c[0][step] = neon_vdupq_n(static_cast<T2>(0)); \
    c[1][step] = neon_vdupq_n(static_cast<T2>(0));
        UNROLL_CALL_RAW(4, BAIS_INIT);
 #undef BAIS_INIT
    }
 };
 template <typename T, typename T2>
 struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> {
    static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {
 #define BAIS_INIT(step)                \
    c[0][step] = neon_vld1q(bias_ptr); \
    c[1][step] = neon_vld1q(bias_ptr + oc_step);
        UNROLL_CALL_RAW(8, BAIS_INIT);
 #undef BAIS_INIT
    }
 };
 template <typename T, typename T2>
 struct InitOcxOw8<2, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> {
    static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {
 #define BAIS_INIT(step)                \
 #define BAIS_INIT_NO_BIAS_C1(step) \
    c[0][step] = neon_vdupq_n(static_cast<T2>(0));

 #define BAIS_INIT_BROADCAST_C2(step)   \
    c[0][step] = neon_vld1q(bias_ptr); \
    c[1][step] = neon_vld1q(bias_ptr + oc_step);
        UNROLL_CALL_RAW(4, BAIS_INIT);
 #undef BAIS_INIT
    }
 };
 template <typename T, typename T2>
 struct InitOcxOw8<2, BiasMode::BIAS, 8, T, T2> {
    static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {
        constexpr int simd_len = 4;
 #define BAIS_INIT(step)                                  \
    c[0][step] = neon_vld1q(bias_ptr + step * simd_len); \
    c[1][step] = neon_vld1q(bias_ptr + oc_step + step * simd_len);
        UNROLL_CALL_RAW(8, BAIS_INIT);
 #undef BAIS_INIT
    }
 };
 template <typename T, typename T2>
 struct InitOcxOw8<2, BiasMode::BIAS, 4, T, T2> {
    static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {
        constexpr int simd_len = 4;
 #define BAIS_INIT(step)                                  \
 #define BAIS_INIT_BROADCAST_C1(step) c[0][step] = neon_vld1q(bias_ptr);

 #define BAIS_INIT_BIAS_C2(step)                          \
    c[0][step] = neon_vld1q(bias_ptr + step * simd_len); \
    c[1][step] = neon_vld1q(bias_ptr + oc_step + step * simd_len);
        UNROLL_CALL_RAW(4, BAIS_INIT);
 #undef BAIS_INIT
    }
 };

 template <typename T, typename T2>
 struct InitOcxOw8<1, BiasMode::NO_BIAS, 8, T, T2> {
    static __ai void impl(T& c, const T2*, int) {
 #define BAIS_INIT(step) c[0][step] = neon_vdupq_n(static_cast<T2>(0));
        UNROLL_CALL_RAW(8, BAIS_INIT);
 #undef BAIS_INIT
    }
 };
 template <typename T, typename T2>
 struct InitOcxOw8<1, BiasMode::NO_BIAS, 4, T, T2> {
    static __ai void impl(T& c, const T2*, int) {
 #define BAIS_INIT(step) c[0][step] = neon_vdupq_n(static_cast<T2>(0));
        UNROLL_CALL_RAW(4, BAIS_INIT);
 #undef BAIS_INIT
    }
 };
 template <typename T, typename T2>
 struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 8, T, T2> {
    static __ai void impl(T& c, const T2* bias_ptr, int) {
 #define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr);
        UNROLL_CALL_RAW(8, BAIS_INIT);
 #undef BAIS_INIT
    }
 };
 template <typename T, typename T2>
 struct InitOcxOw8<1, BiasMode::BROADCAST_CHANNEL_BIAS, 4, T, T2> {
    static __ai void impl(T& c, const T2* bias_ptr, int) {
 #define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr);
        UNROLL_CALL_RAW(4, BAIS_INIT);
 #undef BAIS_INIT
    }
 };
 template <typename T, typename T2>
 struct InitOcxOw8<1, BiasMode::BIAS, 8, T, T2> {
    static __ai void impl(T& c, const T2* bias_ptr, int) {
        constexpr int simd_len = 4;
 #define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr + step * simd_len);
        UNROLL_CALL_RAW(8, BAIS_INIT);
 #undef BAIS_INIT
    }
 };
 template <typename T, typename T2>
 struct InitOcxOw8<1, BiasMode::BIAS, 4, T, T2> {
    static __ai void impl(T& c, const T2* bias_ptr, int) {
        constexpr int simd_len = 4;
 #define BAIS_INIT(step) c[0][step] = neon_vld1q(bias_ptr + step * simd_len);
        UNROLL_CALL_RAW(4, BAIS_INIT);
 #undef BAIS_INIT
    }
 };

 template <int c_dim, BiasMode bias_mode, int ow_block, typename T, typename T2>
 #define BAIS_INIT_BIAS_C1(step) \
    c[0][step] = neon_vld1q(bias_ptr + step * simd_len);

 #define INSTANCE_InitOcxOw8(ow_remain, cdim)                                \
    template <typename T, typename T2>                                      \
    struct InitOcxOw8<cdim, BiasMode::NO_BIAS, ow_remain, T, T2> {          \
        static __ai void impl(T& c, const T2*, int) {                       \
            UNROLL_CALL_RAW(ow_remain, BAIS_INIT_NO_BIAS_C##cdim);          \
        }                                                                   \
    };                                                                      \
    template <typename T, typename T2>                                      \
    struct InitOcxOw8<cdim, BiasMode::BROADCAST_CHANNEL_BIAS, ow_remain, T, \
                      T2> {                                                 \
        static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {      \
            (void)oc_step;                                                  \
            UNROLL_CALL_RAW(ow_remain, BAIS_INIT_BROADCAST_C##cdim);        \
        }                                                                   \
    };                                                                      \
    template <typename T, typename T2>                                      \
    struct InitOcxOw8<cdim, BiasMode::BIAS, ow_remain, T, T2> {             \
        static __ai void impl(T& c, const T2* bias_ptr, int oc_step) {      \
            constexpr int simd_len = NeonLdqSimd<T2>::simd_len;             \
            (void)oc_step;                                                  \
            UNROLL_CALL_RAW(ow_remain, BAIS_INIT_BIAS_C##cdim);             \
        }                                                                   \
    };
 #define INSTANCE_InitOcxOw8_C(ow_remain) \
    INSTANCE_InitOcxOw8(ow_remain, 2);   \
    INSTANCE_InitOcxOw8(ow_remain, 1);

 INSTANCE_InitOcxOw8_C(1);
 INSTANCE_InitOcxOw8_C(2);
 INSTANCE_InitOcxOw8_C(3);
 INSTANCE_InitOcxOw8_C(4);
 INSTANCE_InitOcxOw8_C(5);
 INSTANCE_InitOcxOw8_C(6);
 INSTANCE_InitOcxOw8_C(7);
 INSTANCE_InitOcxOw8_C(8);

 #undef INSTANCE_InitOcxOw8
 #undef INSTANCE_InitOcxOw8_C
 #undef BAIS_INIT_BIAS_C1
 #undef BAIS_INIT_BIAS_C2
 #undef BAIS_INIT_BROADCAST_C1
 #undef BAIS_INIT_BROADCAST_C2
 #undef BAIS_INIT_NO_BIAS_C1
 #undef BAIS_INIT_NO_BIAS_C2

 template <int c_dim, BiasMode bias_mode, int ow_remain, typename T, typename T2>
 __ai void init_ocx_ow8(T& c, const T2* bias_ptr, int oc_step) {
    InitOcxOw8<c_dim, bias_mode, ow_block, T, T2>::impl(c, bias_ptr, oc_step);
    InitOcxOw8<c_dim, bias_mode, ow_remain, T, T2>::impl(c, bias_ptr, oc_step);
 }
 /////////////////////init_ocx_ow4/////////////////////
 template <int c_dim, BiasMode bias_mode, typename T>
--- a/dnn/src/arm_common/simd_macro/neon_helper.h
+++ b/dnn/src/arm_common/simd_macro/neon_helper.h
@@ -18,6 +18,8 @@
 #define MEGDNN_SIMD_TYPE float32x4_t
 #define MEGDNN_SIMD_TYPE2 float32x4x2_t
 #define MEGDNN_SIMD_LOADU(addr) vld1q_f32(addr)
 #define MEGDNN_SIMD_LOADU_2(addr) vcombine_f32(vld1_f32(addr), vdup_n_f32(0.f))
 #define MEGDNN_SIMD_LOADU_3(addr) vld1q_lane_f32(addr + 2, vcombine_f32(vld1_f32(addr), vdup_n_f32(0.f)), 2)
 #define MEGDNN_SIMD_STOREU(addr, reg) vst1q_f32(addr, reg)
 #define MEGDNN_SIMD_SETZERO() vdupq_n_f32(0.0f)
 #define MEGDNN_SIMD_SET1(num) vdupq_n_f32(num)
--- a/dnn/test/common/checker.h
+++ b/dnn/test/common/checker.h
@@ -23,6 +23,20 @@
 #include <regex>
 #include <unordered_map>

 // clang-format off
 #if defined(__has_feature) 
    #if __has_feature(address_sanitizer)
        #define MEGDNN_TEST_ASAN 1
    #else
        #define MEGDNN_TEST_ASAN 0
    #endif
 #elif defined(__SANITIZE_ADDRESS__) 
    #define MEGDNN_TEST_ASAN 1
 #else 
    #define MEGDNN_TEST_ASAN 0
 #endif
 // clang-format on

 namespace megdnn {
 namespace test {

--- a/dnn/test/fallback/warp_perspective.cpp
+++ b/dnn/test/fallback/warp_perspective.cpp
@@ -76,6 +76,9 @@ TEST_F(FALLBACK, WARP_PERSPECTIVE) {
        checker.set_param(param);
        checker.exec({{1000, 2, 10, 11}, {1000, 3, 3}, {1000, 2, 12, 13}});
    }
 #if MEGDNN_TEST_ASAN
 //! asan detect nan will make test failed
 #else
    // resize nan case
    UniformFloatRNG rng_zero(0, 0);
    checker.set_rng(1, &rng_zero);
@@ -85,6 +88,7 @@ TEST_F(FALLBACK, WARP_PERSPECTIVE) {
        checker.set_param(param);
        checker.exec({{1000, 2, 10, 11}, {1000, 3, 3}, {1000, 2, 12, 13}});
    }
 #endif
 }

 TEST_F(FALLBACK, WARP_PERSPECTIVE_MAT_IDX) {
--- a/dnn/test/naive/warp_perspective.cpp
+++ b/dnn/test/naive/warp_perspective.cpp
@@ -352,6 +352,9 @@ TEST_F(NAIVE_MULTI_THREADS, WARP_PERSPECTIVE_FORWARD_HWCD4) {
            checker.execs({{22, 10, 1, 11, 4}, {22, 3, 3}, {22, 11, 1, 12, 4}});
        }
    }
 #if MEGDNN_TEST_ASAN
 //! asan detect nan will make test failed
 #else
    // nan case
    NanMatRNG rng_nan;
    UniformFloatRNG rng_zero(0, 0);
@@ -369,6 +372,7 @@ TEST_F(NAIVE_MULTI_THREADS, WARP_PERSPECTIVE_FORWARD_HWCD4) {
        checker.set_param(param);
        checker.exec({{10, 10, 1, 11, 4}, {10, 3, 3}, {10, 12, 1, 13, 4}});
    }
 #endif
 }

 #if MEGDNN_WITH_BENCHMARK