arm neon assembly optimization for padding int8 pack8, convolution int8 out elempack 4

5 years ago · 68468dccbd
--- a/src/layer/arm/convolution_1x1_pack1to4_int8.h
+++ b/src/layer/arm/convolution_1x1_pack1to4_int8.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv1x1s1_sgemm_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 static void conv1x1s1_sgemm_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -22,10 +22,10 @@ static void conv1x1s1_sgemm_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack1to8_int8_neon(bottom_im2col, top_blob, kernel, opt);
    im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 static void conv1x1s2_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 static void conv1x1s2_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
@@ -79,5 +79,5 @@ static void conv1x1s2_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob,
        }
    }

    conv1x1s1_sgemm_pack1to8_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt);
    conv1x1s1_sgemm_pack1to4_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt);
 }
--- a/src/layer/arm/convolution_1x1_pack8to4_int8.h
+++ b/src/layer/arm/convolution_1x1_pack8to4_int8.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv1x1s1_sgemm_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 static void conv1x1s1_sgemm_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
@@ -22,10 +22,10 @@ static void conv1x1s1_sgemm_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blo
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack8_int8_neon(bottom_im2col, top_blob, kernel, opt);
    im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 static void conv1x1s2_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 static void conv1x1s2_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
@@ -86,5 +86,5 @@ static void conv1x1s2_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, con
        }
    }

    conv1x1s1_sgemm_pack8_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt);
    conv1x1s1_sgemm_pack8to4_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt);
 }
--- a/src/layer/arm/convolution_3x3_pack1to4_int8.h
+++ b/src/layer/arm/convolution_3x3_pack1to4_int8.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv3x3s1_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 static void conv3x3s1_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -76,10 +76,10 @@ static void conv3x3s1_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob,
        }
    }

    im2col_sgemm_pack1to8_int8_neon(bottom_im2col, top_blob, kernel, opt);
    im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }

 static void conv3x3s2_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 static void conv3x3s2_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -143,5 +143,5 @@ static void conv3x3s2_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob,
        }
    }

    im2col_sgemm_pack1to8_int8_neon(bottom_im2col, top_blob, kernel, opt);
    im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }
--- a/src/layer/arm/convolution_3x3_pack8to4_int8.h
+++ b/src/layer/arm/convolution_3x3_pack8to4_int8.h
--- a/src/layer/arm/convolution_7x7_pack1to4_int8.h
+++ b/src/layer/arm/convolution_7x7_pack1to4_int8.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void conv7x7s2_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 static void conv7x7s2_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -76,5 +76,5 @@ static void conv7x7s2_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob,
        }
    }

    im2col_sgemm_pack1to8_int8_neon(bottom_im2col, top_blob, kernel, opt);
    im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -70,18 +70,18 @@ namespace ncnn {
 #include "convolution_7x7_pack1to4_bf16s.h"

 #if NCNN_INT8
 #include "convolution_pack8_int8.h"
 #include "convolution_pack1to8_int8.h"
 #include "convolution_pack8to4_int8.h"
 #include "convolution_pack1to4_int8.h"
 #include "convolution_pack8to1_int8.h"
 #include "convolution_sgemm_pack8_int8.h"
 #include "convolution_sgemm_pack1to8_int8.h"
 #include "convolution_sgemm_pack8to4_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
 #include "convolution_1x1_pack8_int8.h"
 #include "convolution_1x1_pack1to8_int8.h"
 #include "convolution_1x1_pack8to4_int8.h"
 #include "convolution_1x1_pack1to4_int8.h"
 #include "convolution_1x1_pack8to1_int8.h"
 #include "convolution_3x3_pack8_int8.h"
 #include "convolution_3x3_pack1to8_int8.h"
 #include "convolution_7x7_pack1to8_int8.h"
 #include "convolution_3x3_pack8to4_int8.h"
 #include "convolution_3x3_pack1to4_int8.h"
 #include "convolution_7x7_pack1to4_int8.h"
 #include "convolution_3x3_pack8to1_int8.h"
 #endif // NCNN_INT8

@@ -1787,7 +1787,7 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
    if (opt.use_packing_layout)
    {
        elempack = num_input % 8 == 0 ? 8 : 1;
        out_elempack = num_output % 8 == 0 ? 8 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
 #endif // __ARM_NEON

@@ -1855,15 +1855,15 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
    }

 #if __ARM_NEON
    if (elempack == 8 && out_elempack == 8)
    if (elempack == 8 && out_elempack == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
        }
 #if __ARM_FEATURE_DOTPROD
        else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 256 && num_output >= 256)
@@ -1871,39 +1871,39 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
        else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
 #endif
        {
            conv3x3s1_winograd42_transform_kernel_pack8_int8_neon(weight_data, weight_data_int8, num_input, num_output);
            conv3x3s1_winograd42_transform_kernel_pack8to4_int8_neon(weight_data, weight_data_int8, num_input, num_output);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_pack8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
        }
    }

    if (elempack == 1 && out_elempack == 8)
    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
        }
    }

@@ -1966,7 +1966,7 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con
 #if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 8 == 0 ? 8 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
 #endif // __ARM_NEON
    bool use_int8_requantize = int8_scale_term > 100;
@@ -1988,26 +1988,21 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con

    const int num_input = channels * elempack;

    Mat top_blob_int32;
    top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator);
    if (top_blob_int32.empty())
        return -100;

 #if __ARM_NEON
    if (elempack == 8 && out_elempack == 8)
    if (elempack == 8 && out_elempack == 4)
    {
        Mat top_blob_int32;

        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator);
            if (top_blob_int32.empty())
                return -100;

            conv1x1s1_sgemm_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
            conv1x1s1_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator);
            if (top_blob_int32.empty())
                return -100;

            conv1x1s2_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
            conv1x1s2_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
        }
 #if __ARM_FEATURE_DOTPROD
        else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 256 && num_output >= 256)
@@ -2015,27 +2010,15 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con
        else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
 #endif
        {
            top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator);
            if (top_blob_int32.empty())
                return -100;

            conv3x3s1_winograd42_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
            conv3x3s1_winograd42_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator);
            if (top_blob_int32.empty())
                return -100;

            convolution_im2col_sgemm_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
            convolution_im2col_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator);
            if (top_blob_int32.empty())
                return -100;

            convolution_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
            convolution_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }

        Mat scale_in_data(num_output);
@@ -2066,65 +2049,35 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con
        }
    }

    if (elempack == 1 && out_elempack == 8)
    if (elempack == 1 && out_elempack == 4)
    {
        Mat top_blob_int32;

        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator);
            if (top_blob_int32.empty())
                return -100;

            conv1x1s1_sgemm_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
            conv1x1s1_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator);
            if (top_blob_int32.empty())
                return -100;

            conv1x1s2_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
            conv1x1s2_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator);
            if (top_blob_int32.empty())
                return -100;

            conv3x3s1_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
            conv3x3s1_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator);
            if (top_blob_int32.empty())
                return -100;

            conv3x3s2_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
            conv3x3s2_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator);
            if (top_blob_int32.empty())
                return -100;

            conv7x7s2_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
            conv7x7s2_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator);
            if (top_blob_int32.empty())
                return -100;

            convolution_im2col_sgemm_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
            convolution_im2col_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator);
            if (top_blob_int32.empty())
                return -100;

            convolution_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
            convolution_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }

        Mat scale_in_data(num_output);
@@ -2157,11 +2110,6 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con

    if (elempack == 8 && out_elempack == 1)
    {
        Mat top_blob_int32;
        top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator);
        if (top_blob_int32.empty())
            return -100;

        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
@@ -2214,11 +2162,6 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con

    if (elempack == 1 && out_elempack == 1)
    {
        Mat top_blob_int32;
        top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator);
        if (top_blob_int32.empty())
            return -100;

        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
--- a/src/layer/arm/convolution_pack1to4_int8.h
+++ b/src/layer/arm/convolution_pack1to4_int8.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void convolution_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 static void convolution_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
@@ -53,7 +53,6 @@ static void convolution_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob
            for (int j = 0; j < outw; j++)
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                int32x4_t _sum1 = vdupq_n_s32(0);

                const signed char* kptr = weight_data_int8.channel(p);

@@ -69,17 +68,15 @@ static void convolution_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob
                        int8x8_t _w = vld1_s8(kptr);
                        int16x8_t _s0 = vmull_s8(_val, _w);
                        _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                        _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                        kptr += 8;
                    }
                }

                vst1q_s32(outptr + j * 8, _sum0);
                vst1q_s32(outptr + j * 8 + 4, _sum1);
                vst1q_s32(outptr + j * 4, _sum0);
            }

            outptr += outw * 8;
            outptr += outw * 4;
        }
    }
 }
--- a/src/layer/arm/convolution_pack8to4_int8.h
+++ b/src/layer/arm/convolution_pack8to4_int8.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void convolution_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 static void convolution_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
@@ -54,8 +54,6 @@ static void convolution_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, c
            {
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum23 = vdupq_n_s32(0);
                int32x4_t _sum45 = vdupq_n_s32(0);
                int32x4_t _sum67 = vdupq_n_s32(0);

                const signed char* kptr = weight_data_int8.channel(p);

@@ -73,46 +71,30 @@ static void convolution_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, c
                        int8x8_t _w1 = vld1_s8(kptr + 8);
                        int8x8_t _w2 = vld1_s8(kptr + 16);
                        int8x8_t _w3 = vld1_s8(kptr + 24);
                        int8x8_t _w4 = vld1_s8(kptr + 32);
                        int8x8_t _w5 = vld1_s8(kptr + 40);
                        int8x8_t _w6 = vld1_s8(kptr + 48);
                        int8x8_t _w7 = vld1_s8(kptr + 56);

                        int16x8_t _wv0 = vmull_s8(_val, _w0);
                        int16x8_t _wv1 = vmull_s8(_val, _w1);
                        int16x8_t _wv2 = vmull_s8(_val, _w2);
                        int16x8_t _wv3 = vmull_s8(_val, _w3);
                        int16x8_t _wv4 = vmull_s8(_val, _w4);
                        int16x8_t _wv5 = vmull_s8(_val, _w5);
                        int16x8_t _wv6 = vmull_s8(_val, _w6);
                        int16x8_t _wv7 = vmull_s8(_val, _w7);

                        int16x4_t _wv00 = vpadd_s16(vget_low_s16(_wv0), vget_high_s16(_wv0));
                        int16x4_t _wv11 = vpadd_s16(vget_low_s16(_wv1), vget_high_s16(_wv1));
                        int16x4_t _wv22 = vpadd_s16(vget_low_s16(_wv2), vget_high_s16(_wv2));
                        int16x4_t _wv33 = vpadd_s16(vget_low_s16(_wv3), vget_high_s16(_wv3));
                        int16x4_t _wv44 = vpadd_s16(vget_low_s16(_wv4), vget_high_s16(_wv4));
                        int16x4_t _wv55 = vpadd_s16(vget_low_s16(_wv5), vget_high_s16(_wv5));
                        int16x4_t _wv66 = vpadd_s16(vget_low_s16(_wv6), vget_high_s16(_wv6));
                        int16x4_t _wv77 = vpadd_s16(vget_low_s16(_wv7), vget_high_s16(_wv7));

                        _sum01 = vpadalq_s16(_sum01, vcombine_s16(_wv00, _wv11));
                        _sum23 = vpadalq_s16(_sum23, vcombine_s16(_wv22, _wv33));
                        _sum45 = vpadalq_s16(_sum45, vcombine_s16(_wv44, _wv55));
                        _sum67 = vpadalq_s16(_sum67, vcombine_s16(_wv66, _wv77));

                        kptr += 64;
                        kptr += 32;
                    }
                }

                int32x4_t _sum0 = vcombine_s32(vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)), vpadd_s32(vget_low_s32(_sum23), vget_high_s32(_sum23)));
                int32x4_t _sum1 = vcombine_s32(vpadd_s32(vget_low_s32(_sum45), vget_high_s32(_sum45)), vpadd_s32(vget_low_s32(_sum67), vget_high_s32(_sum67)));

                vst1q_s32(outptr + j * 8, _sum0);
                vst1q_s32(outptr + j * 8 + 4, _sum1);
                vst1q_s32(outptr + j * 4, _sum0);
            }

            outptr += outw * 8;
            outptr += outw * 4;
        }
    }
 }
--- a/src/layer/arm/convolution_sgemm_pack1to4_int8.h
+++ b/src/layer/arm/convolution_sgemm_pack1to4_int8.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void im2col_sgemm_pack1to8_int8_neon(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 static void im2col_sgemm_pack1to4_int8_neon(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

@@ -2422,7 +2422,7 @@ static void im2col_sgemm_pack1to8_int8_neon(const Mat& bottom_im2col, Mat& top_b
    }
 }

 static void convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
    const int maxk = kernel_w * kernel_h;

@@ -2519,7 +2519,7 @@ static void convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(const M
    }
 }

 static void convolution_im2col_sgemm_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 static void convolution_im2col_sgemm_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -2583,5 +2583,5 @@ static void convolution_im2col_sgemm_pack1to8_int8_neon(const Mat& bottom_blob,
        }
    }

    im2col_sgemm_pack1to8_int8_neon(bottom_im2col, top_blob, kernel, opt);
    im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }
--- a/src/layer/arm/convolution_sgemm_pack8to4_int8.h
+++ b/src/layer/arm/convolution_sgemm_pack8to4_int8.h
@@ -12,7 +12,7 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 static void im2col_sgemm_pack8_int8_neon(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 static void im2col_sgemm_pack8to4_int8_neon(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
    // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

@@ -1103,7 +1103,7 @@ static void im2col_sgemm_pack8_int8_neon(const Mat& bottom_im2col, Mat& top_blob
    }
 }

 static void convolution_im2col_sgemm_transform_kernel_pack8_int8_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
 {
    const int maxk = kernel_w * kernel_h;

@@ -1163,7 +1163,7 @@ static void convolution_im2col_sgemm_transform_kernel_pack8_int8_neon(const Mat&
    }
 }

 static void convolution_im2col_sgemm_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 static void convolution_im2col_sgemm_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
 {
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
@@ -1234,5 +1234,5 @@ static void convolution_im2col_sgemm_pack8_int8_neon(const Mat& bottom_blob, Mat
        }
    }

    im2col_sgemm_pack8_int8_neon(bottom_im2col, top_blob, kernel, opt);
    im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt);
 }
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -1538,31 +1538,31 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_
    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    int out_elempack = 1;
 #if __ARM_NEON
    if (opt.use_packing_layout)
    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        out_elempack = num_output % 8 == 0 ? 8 : 1;
    }
        int out_elempack = 1;
 #if __ARM_NEON
        if (opt.use_packing_layout)
        {
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        }
 #endif // __ARM_NEON
    bool use_int8_requantize = int8_scale_term > 100;
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
        bool use_int8_requantize = int8_scale_term > 100;
        size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage)
    {
        out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;
    }
        if (opt.use_fp16_storage)
        {
            out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;
        }
 #endif
    if (opt.use_bf16_storage)
        out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;
        if (opt.use_bf16_storage)
            out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;
        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        // TODO use fp16 / bf16
        out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
@@ -1934,6 +1934,28 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_
        return 0;
    }

    int out_elempack = 1;
 #if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
 #endif // __ARM_NEON
    bool use_int8_requantize = int8_scale_term > 100;
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage)
    {
        out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;
    }
 #endif
    if (opt.use_bf16_storage)
        out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // group convolution
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;
@@ -1944,7 +1966,7 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_
    if (opt.use_packing_layout)
    {
        g_elempack = channels_g % 8 == 0 ? 8 : 1;
        out_g_elempack = num_output_g % 8 == 0 ? 8 : 1;
        out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
    }
 #endif // __ARM_NEON

--- a/src/layer/arm/padding_pack8_int8.h
+++ b/src/layer/arm/padding_pack8_int8.h
@@ -12,50 +12,328 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 // Writes src into dst surrounded by a constant border: `top` and `bottom` full
 // rows of dst.w elements, plus `left` / `right` elements around each of the
 // src.h copied rows. One element is 8 signed bytes (one int8x8_t); the border
 // value is the 8-lane vector passed as the last argument.
 //
 // NOTE(review): this span looks like a diff whose +/- markers were lost. The
 // `_v` signature with the vst1_s8 / vld1_s8 loops appears to be the removed
 // version, and the `v` signature with the inline assembly its replacement -
 // confirm against the repository before treating both as live code.
 static void padding_constant_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int8x8_t _v)
 static void padding_constant_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int8x8_t v)
 {
    const signed char* ptr = src;
    signed char* outptr = dst;

    // fill top: `top` rows of dst.w border elements, 8 bytes per store
    for (int y = 0; y < top; y++)
    {
        for (int x = 0; x < dst.w; x++)
        {
            vst1_s8(outptr, _v);
            outptr += 8;
        }
    }
    // fill center: for every source row emit left border, the row itself, right border
    for (int y = 0; y < src.h; y++)
    {
        for (int x = 0; x < left; x++)
        {
            vst1_s8(outptr, _v);
            outptr += 8;
        }
        // copy src.w elements unchanged, one 8-byte element per iteration
        for (int x = 0; x < src.w; x++)
        {
            int8x8_t _p = vld1_s8(ptr);
            vst1_s8(outptr, _p);
            ptr += 8;
            outptr += 8;
        }
        for (int x = 0; x < right; x++)
        {
            vst1_s8(outptr, _v);
            outptr += 8;
        }
    }
    // fill bottom: `bottom` rows of dst.w border elements
    for (int y = 0; y < bottom; y++)
    {
        for (int x = 0; x < dst.w; x++)
        {
            vst1_s8(outptr, _v);
            outptr += 8;
        }
    }
    int w = src.w;
    int h = src.h;

    // border sizes are counted in 8-byte elements, not in bytes
    int top_size = top * dst.w;
    int bottom_size = bottom * dst.w;

 #if __aarch64__
    // Register use: w4 is the scratch counter (x4 is in the clobber list),
    // v0-v3 hold the border value, v16-v19 carry copied source data.
    // Note that `w4` is the hard register while `%w4` is operand 4 (w).
    asm volatile(
        // replicate the 8-byte border value into both halves of v0, then into v1-v3
        "mov    v0.8b, %10.8b           \n"
        "mov    v0.d[1], v0.d[0]        \n"
        "mov    v1.16b, v0.16b          \n"
        "mov    v2.16b, v0.16b          \n"
        "mov    v3.16b, v0.16b          \n"

        // fill top, 64 bytes (8 elements) per iteration
        "lsr    w4, %w8, #3             \n" // w4 = nn = top_size >> 3
        "cmp    w4, #0                  \n"
        "beq    1f                      \n"

        "0:                             \n"
        "st1    {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], #64 \n"
        "subs   w4, w4, #1              \n"
        "bne    0b                      \n"

        "1:                             \n"

        // fill top remain: the 0..7 leftover elements go out as 4 + 2 + 1
        "and    w4, %w8, #7             \n" // w4 = remain = top_size & 7

        "cmp    w4, #4                  \n" // w4 >= 4
        "blt    2f                      \n"
        "sub    w4, w4, #4              \n"
        "st1    {v0.16b, v1.16b}, [%0], #32 \n"
        "2:                             \n"

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    3f                      \n"
        "sub    w4, w4, #2              \n"
        "st1    {v0.16b}, [%0], #16     \n"
        "3:                             \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    4f                      \n"
        "st1    {v0.8b}, [%0], #8       \n"
        "4:                             \n"

        // fill center h loop: skipped entirely when h == 0, otherwise one pass per source row
        "cmp    %w5, #0                 \n"
        "beq    15f                     \n"
        "5:                             \n"

        // fill left, one element per iteration
        "mov    w4, %w6                 \n" // w4 = left
        "cmp    w4, #0                  \n"
        "beq    7f                      \n"

        "6:                             \n"
        "st1    {v0.8b}, [%0], #8       \n"
        "subs   w4, w4, #1              \n"
        "bne    6b                      \n"

        "7:                             \n"

        // fill middle: copy the source row, 64 bytes (8 elements) per iteration
        "lsr    w4, %w4, #3             \n" // w4 = nn = w >> 3
        "cmp    w4, #0                  \n"
        "beq    9f                      \n"

        "8:                             \n"
        "prfm   pldl1keep, [%1, #512]   \n"
        "ld1    {v16.16b, v17.16b, v18.16b, v19.16b}, [%1], #64 \n"
        "subs   w4, w4, #1              \n"
        "st1    {v16.16b, v17.16b, v18.16b, v19.16b}, [%0], #64 \n"
        "bne    8b                      \n"

        "9:                             \n"

        // middle remain: leftover 0..7 elements copied as 4 + 2 + 1
        "and    w4, %w4, #7             \n" // w4 = remain = w & 7

        "cmp    w4, #4                  \n" // w4 >= 4
        "blt    10f                     \n"
        "prfm   pldl1keep, [%1, #256]   \n"
        "ld1    {v16.16b, v17.16b}, [%1], #32 \n"
        "sub    w4, w4, #4              \n"
        "st1    {v16.16b, v17.16b}, [%0], #32 \n"
        "10:                            \n"

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    11f                     \n"
        "prfm   pldl1keep, [%1, #128]   \n"
        "ld1    {v16.16b}, [%1], #16    \n"
        "sub    w4, w4, #2              \n"
        "st1    {v16.16b}, [%0], #16    \n"
        "11:                            \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    12f                     \n"
        "prfm   pldl1keep, [%1, #64]    \n"
        "ld1    {v16.8b}, [%1], #8      \n"
        "st1    {v16.8b}, [%0], #8      \n"
        "12:                            \n"

        // fill right, one element per iteration
        "mov    w4, %w7                 \n" // w4 = right
        "cmp    w4, #0                  \n"
        "beq    14f                     \n"

        "13:                            \n"
        "subs   w4, w4, #1              \n"
        "st1    {v0.8b}, [%0], #8       \n"
        "bne    13b                     \n"
        "14:                            \n"

        // NOTE(review): %5 (h) is bound below as an input-only "r" operand, yet it
        // is decremented here. GCC's extended-asm rules say input-only operands
        // must not be modified; a "+r" operand or a scratch copy would make the
        // row counter well-defined.
        "subs   %w5, %w5, #1            \n"
        "bne    5b                      \n"

        "15:                            \n"

        // fill bottom, 64 bytes (8 elements) per iteration
        "lsr    w4, %w9, #3             \n" // w4 = nn = bottom_size >> 3
        "cmp    w4, #0                  \n"
        "beq    17f                     \n"

        "16:                            \n"
        "st1    {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], #64 \n"
        "subs   w4, w4, #1              \n"
        "bne    16b                     \n"
        "17:                            \n"

        // fill bottom remain: leftover 0..7 elements as 4 + 2 + 1
        "and    w4, %w9, #7             \n" // w4 = remain = bottom_size & 7

        "cmp    w4, #4                  \n" // w4 >= 4
        "blt    18f                     \n"
        "sub    w4, w4, #4              \n"
        "st1    {v0.16b, v1.16b}, [%0], #32 \n"
        "18:                            \n"

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    19f                     \n"
        "sub    w4, w4, #2              \n"
        "st1    {v0.16b}, [%0], #16     \n"
        "19:                            \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    20f                     \n"
        "st1    {v0.8b}, [%0], #8       \n"
        "20:                            \n"

        : "=r"(outptr), // %0
        "=r"(ptr)     // %1
        : "0"(outptr), // %2, shares a register with %0
        "1"(ptr), // %3, shares a register with %1
        "r"(w),           // %4
        "r"(h),           // %5
        "r"(left),        // %6
        "r"(right),       // %7
        "r"(top_size),    // %8
        "r"(bottom_size), // %9
        "w"(v)            // %10
        : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
 #else  // __aarch64__
    // Same sequence for 32-bit ARM: r4 is the scratch counter, q0-q3 (d0-d7)
    // hold the border value, d16-d23 (q8-q11) carry copied source data.
    asm volatile(
        // replicate the 8-byte border value into d0/d1, then copy q0 into q1-q3
        "vmov       d0, %P10            \n"
        "vmov       d1, d0              \n"
        "vmov       q1, q0              \n"
        "vmov       q2, q0              \n"
        "vmov       q3, q0              \n"

        // fill top, d0-d7 = 64 bytes (8 elements) per iteration
        "lsr        r4, %8, #3          \n" // r4 = nn = top_size >> 3
        "cmp        r4, #0              \n"
        "beq        1f                  \n"

        "0:                             \n"
        "vstm       %0!, {d0-d7}        \n"
        "subs       r4, r4, #1          \n"
        "bne        0b                  \n"

        "1:                             \n"

        // fill top remain: leftover 0..7 elements as 4 + 2 + 1
        // NOTE(review): the two multi-register stores below carry a :128 alignment
        // hint while the matching bottom-remain stores use :64; this presumably
        // relies on dst starting 16-byte aligned - TODO confirm.
        "and        r4, %8, #7          \n" // r4 = remain = top_size & 7

        "cmp        r4, #4              \n" // r4 >= 4
        "blt        2f                  \n"
        "sub        r4, r4, #4          \n"
        "vst1.s8    {d0-d3}, [%0 :128]! \n"
        "2:                             \n"

        "cmp        r4, #2              \n" // r4 >= 2
        "blt        3f                  \n"
        "sub        r4, r4, #2          \n"
        "vst1.s8    {d0-d1}, [%0 :128]! \n"
        "3:                             \n"

        "cmp        r4, #0              \n" // r4 > 0
        "beq        4f                  \n"
        "vst1.s8    {d0}, [%0 :64]!     \n"
        "4:                             \n"

        // fill center h loop: skipped entirely when h == 0, otherwise one pass per source row
        "cmp        %5, #0              \n"
        "beq        15f                 \n"
        "5:                             \n"

        // fill left, one element per iteration
        "mov        r4, %6              \n" // r4 = left
        "cmp        r4, #0              \n"
        "beq        7f                  \n"

        "6:                             \n"
        "vst1.s8    {d0}, [%0 :64]!     \n"
        "subs       r4, r4, #1          \n"
        "bne        6b                  \n"

        "7:                             \n"

        // fill middle: copy the source row, 64 bytes (8 elements) per iteration
        "lsr        r4, %4, #3          \n" // r4 = nn = w >> 3
        "cmp        r4, #0              \n"
        "beq        9f                  \n"

        "8:                             \n"
        "pld        [%1, #512]          \n"
        "vldm       %1!, {d16-d23}      \n"
        "subs       r4, r4, #1          \n"
        "vstm       %0!, {d16-d23}      \n"
        "bne        8b                  \n"

        "9:                             \n"

        // middle remain: leftover 0..7 elements copied as 4 + 2 + 1
        "and        r4, %4, #7          \n" // r4 = remain = w & 7

        "cmp        r4, #4              \n" // r4 >= 4
        "blt        10f                 \n"
        "pld        [%1, #256]          \n"
        "vld1.s8    {d16-d19}, [%1 :64]! \n"
        "sub        r4, r4, #4          \n"
        "vst1.s8    {d16-d19}, [%0 :64]! \n"
        "10:                            \n"

        "cmp        r4, #2              \n" // r4 >= 2
        "blt        11f                 \n"
        "pld        [%1, #128]          \n"
        "vld1.s8    {d16-d17}, [%1 :64]! \n"
        "sub        r4, r4, #2          \n"
        "vst1.s8    {d16-d17}, [%0 :64]! \n"
        "11:                            \n"

        "cmp        r4, #0              \n" // r4 > 0
        "beq        12f                 \n"
        "pld        [%1, #64]           \n"
        "vld1.s8    {d16}, [%1 :64]!    \n"
        "vst1.s8    {d16}, [%0 :64]!    \n"
        "12:                            \n"

        // fill right, one element per iteration
        "mov        r4, %7              \n" // r4 = right
        "cmp        r4, #0              \n"
        "beq        14f                 \n"

        "13:                            \n"
        "subs       r4, r4, #1          \n"
        "vst1.s8    {d0}, [%0 :64]!     \n"
        "bne        13b                 \n"
        "14:                            \n"

        // NOTE(review): %5 (h) is bound below as an input-only "r" operand, yet it
        // is decremented here. GCC's extended-asm rules say input-only operands
        // must not be modified; a "+r" operand or a scratch copy would make the
        // row counter well-defined.
        "subs       %5, %5, #1          \n"
        "bne        5b                  \n"

        "15:                            \n"

        // fill bottom, d0-d7 = 64 bytes (8 elements) per iteration
        "lsr        r4, %9, #3          \n" // r4 = nn = bottom_size >> 3
        "cmp        r4, #0              \n"
        "beq        17f                 \n"

        "16:                            \n"
        "vstm       %0!, {d0-d7}        \n"
        "subs       r4, r4, #1          \n"
        "bne        16b                 \n"
        "17:                            \n"

        // fill bottom remain: leftover 0..7 elements as 4 + 2 + 1
        "and        r4, %9, #7          \n" // r4 = remain = bottom_size & 7

        "cmp        r4, #4              \n" // r4 >= 4
        "blt        18f                 \n"
        "sub        r4, r4, #4          \n"
        "vst1.s8    {d0-d3}, [%0 :64]!  \n"
        "18:                            \n"

        "cmp        r4, #2              \n" // r4 >= 2
        "blt        19f                 \n"
        "sub        r4, r4, #2          \n"
        "vst1.s8    {d0-d1}, [%0 :64]!  \n"
        "19:                            \n"

        "cmp        r4, #0              \n" // r4 > 0
        "beq        20f                 \n"
        "vst1.s8    {d0}, [%0 :64]!     \n"
        "20:                            \n"

        : "=r"(outptr), // %0
        "=r"(ptr)     // %1
        : "0"(outptr), // %2, shares a register with %0
        "1"(ptr), // %3, shares a register with %1
        "r"(w),           // %4
        "r"(h),           // %5
        "r"(left),        // %6
        "r"(right),       // %7
        "r"(top_size),    // %8
        "r"(bottom_size), // %9
        "w"(v)            // %10
        : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
 #endif // __aarch64__
 }

 static void padding_replicate_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)