| @@ -12,7 +12,7 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv1x1s1_sgemm_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| static void conv1x1s1_sgemm_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -22,10 +22,10 @@ static void conv1x1s1_sgemm_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_ | |||
| bottom_im2col.w = size; | |||
| bottom_im2col.h = 1; | |||
| im2col_sgemm_pack1to8_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| static void conv1x1s2_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| static void conv1x1s2_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| @@ -79,5 +79,5 @@ static void conv1x1s2_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| } | |||
| } | |||
| conv1x1s1_sgemm_pack1to8_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt); | |||
| conv1x1s1_sgemm_pack1to4_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt); | |||
| } | |||
| @@ -12,7 +12,7 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv1x1s1_sgemm_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| static void conv1x1s1_sgemm_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| @@ -22,10 +22,10 @@ static void conv1x1s1_sgemm_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blo | |||
| bottom_im2col.w = size; | |||
| bottom_im2col.h = 1; | |||
| im2col_sgemm_pack8_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| static void conv1x1s2_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| static void conv1x1s2_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| @@ -86,5 +86,5 @@ static void conv1x1s2_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| } | |||
| conv1x1s1_sgemm_pack8_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt); | |||
| conv1x1s1_sgemm_pack8to4_int8_neon(bottom_blob_shrinked, top_blob, kernel, opt); | |||
| } | |||
| @@ -12,7 +12,7 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv3x3s1_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| static void conv3x3s1_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -76,10 +76,10 @@ static void conv3x3s1_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| } | |||
| } | |||
| im2col_sgemm_pack1to8_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| static void conv3x3s2_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| static void conv3x3s2_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -143,5 +143,5 @@ static void conv3x3s2_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| } | |||
| } | |||
| im2col_sgemm_pack1to8_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| @@ -12,7 +12,7 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv7x7s2_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| static void conv7x7s2_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -76,5 +76,5 @@ static void conv7x7s2_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, | |||
| } | |||
| } | |||
| im2col_sgemm_pack1to8_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| @@ -70,18 +70,18 @@ namespace ncnn { | |||
| #include "convolution_7x7_pack1to4_bf16s.h" | |||
| #if NCNN_INT8 | |||
| #include "convolution_pack8_int8.h" | |||
| #include "convolution_pack1to8_int8.h" | |||
| #include "convolution_pack8to4_int8.h" | |||
| #include "convolution_pack1to4_int8.h" | |||
| #include "convolution_pack8to1_int8.h" | |||
| #include "convolution_sgemm_pack8_int8.h" | |||
| #include "convolution_sgemm_pack1to8_int8.h" | |||
| #include "convolution_sgemm_pack8to4_int8.h" | |||
| #include "convolution_sgemm_pack1to4_int8.h" | |||
| #include "convolution_sgemm_pack8to1_int8.h" | |||
| #include "convolution_1x1_pack8_int8.h" | |||
| #include "convolution_1x1_pack1to8_int8.h" | |||
| #include "convolution_1x1_pack8to4_int8.h" | |||
| #include "convolution_1x1_pack1to4_int8.h" | |||
| #include "convolution_1x1_pack8to1_int8.h" | |||
| #include "convolution_3x3_pack8_int8.h" | |||
| #include "convolution_3x3_pack1to8_int8.h" | |||
| #include "convolution_7x7_pack1to8_int8.h" | |||
| #include "convolution_3x3_pack8to4_int8.h" | |||
| #include "convolution_3x3_pack1to4_int8.h" | |||
| #include "convolution_7x7_pack1to4_int8.h" | |||
| #include "convolution_3x3_pack8to1_int8.h" | |||
| #endif // NCNN_INT8 | |||
| @@ -1787,7 +1787,7 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt) | |||
| if (opt.use_packing_layout) | |||
| { | |||
| elempack = num_input % 8 == 0 ? 8 : 1; | |||
| out_elempack = num_output % 8 == 0 ? 8 : 1; | |||
| out_elempack = num_output % 4 == 0 ? 4 : 1; | |||
| } | |||
| #endif // __ARM_NEON | |||
| @@ -1855,15 +1855,15 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt) | |||
| } | |||
| #if __ARM_NEON | |||
| if (elempack == 8 && out_elempack == 8) | |||
| if (elempack == 8 && out_elempack == 4) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| #if __ARM_FEATURE_DOTPROD | |||
| else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 256 && num_output >= 256) | |||
| @@ -1871,39 +1871,39 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt) | |||
| else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| #endif | |||
| { | |||
| conv3x3s1_winograd42_transform_kernel_pack8_int8_neon(weight_data, weight_data_int8, num_input, num_output); | |||
| conv3x3s1_winograd42_transform_kernel_pack8to4_int8_neon(weight_data, weight_data_int8, num_input, num_output); | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| } | |||
| if (elempack == 1 && out_elempack == 8) | |||
| if (elempack == 1 && out_elempack == 4) | |||
| { | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) | |||
| { | |||
| convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h); | |||
| } | |||
| } | |||
| @@ -1966,7 +1966,7 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con | |||
| #if __ARM_NEON | |||
| if (opt.use_packing_layout) | |||
| { | |||
| out_elempack = num_output % 8 == 0 ? 8 : 1; | |||
| out_elempack = num_output % 4 == 0 ? 4 : 1; | |||
| } | |||
| #endif // __ARM_NEON | |||
| bool use_int8_requantize = int8_scale_term > 100; | |||
| @@ -1988,26 +1988,21 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con | |||
| const int num_input = channels * elempack; | |||
| Mat top_blob_int32; | |||
| top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| #if __ARM_NEON | |||
| if (elempack == 8 && out_elempack == 8) | |||
| if (elempack == 8 && out_elempack == 4) | |||
| { | |||
| Mat top_blob_int32; | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| conv1x1s1_sgemm_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| conv1x1s1_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| conv1x1s2_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| conv1x1s2_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| } | |||
| #if __ARM_FEATURE_DOTPROD | |||
| else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 256 && num_output >= 256) | |||
| @@ -2015,27 +2010,15 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con | |||
| else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| #endif | |||
| { | |||
| top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| conv3x3s1_winograd42_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| conv3x3s1_winograd42_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) | |||
| { | |||
| top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| convolution_im2col_sgemm_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_im2col_sgemm_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| else | |||
| { | |||
| top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| convolution_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_pack8to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| Mat scale_in_data(num_output); | |||
| @@ -2066,65 +2049,35 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| } | |||
| if (elempack == 1 && out_elempack == 8) | |||
| if (elempack == 1 && out_elempack == 4) | |||
| { | |||
| Mat top_blob_int32; | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| conv1x1s1_sgemm_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| conv1x1s1_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| } | |||
| else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| conv1x1s2_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| conv1x1s2_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| conv3x3s1_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| conv3x3s1_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| conv3x3s2_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| conv3x3s2_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| } | |||
| else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| conv7x7s2_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| conv7x7s2_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| } | |||
| else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) | |||
| { | |||
| top_blob_int32.create(outw, outh, num_output / 4, (size_t)(4u * 4), 4, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| convolution_im2col_sgemm_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_im2col_sgemm_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| else | |||
| { | |||
| top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| convolution_pack1to8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| convolution_pack1to4_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); | |||
| } | |||
| Mat scale_in_data(num_output); | |||
| @@ -2157,11 +2110,6 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con | |||
| if (elempack == 8 && out_elempack == 1) | |||
| { | |||
| Mat top_blob_int32; | |||
| top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_pack8to1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| @@ -2214,11 +2162,6 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con | |||
| if (elempack == 1 && out_elempack == 1) | |||
| { | |||
| Mat top_blob_int32; | |||
| top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator); | |||
| if (top_blob_int32.empty()) | |||
| return -100; | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt); | |||
| @@ -12,7 +12,7 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void convolution_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| static void convolution_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| @@ -53,7 +53,6 @@ static void convolution_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob | |||
| for (int j = 0; j < outw; j++) | |||
| { | |||
| int32x4_t _sum0 = vdupq_n_s32(0); | |||
| int32x4_t _sum1 = vdupq_n_s32(0); | |||
| const signed char* kptr = weight_data_int8.channel(p); | |||
| @@ -69,17 +68,15 @@ static void convolution_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob | |||
| int8x8_t _w = vld1_s8(kptr); | |||
| int16x8_t _s0 = vmull_s8(_val, _w); | |||
| _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); | |||
| _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); | |||
| kptr += 8; | |||
| } | |||
| } | |||
| vst1q_s32(outptr + j * 8, _sum0); | |||
| vst1q_s32(outptr + j * 8 + 4, _sum1); | |||
| vst1q_s32(outptr + j * 4, _sum0); | |||
| } | |||
| outptr += outw * 8; | |||
| outptr += outw * 4; | |||
| } | |||
| } | |||
| } | |||
| @@ -12,7 +12,7 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void convolution_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| static void convolution_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int channels = bottom_blob.c; | |||
| @@ -54,8 +54,6 @@ static void convolution_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, c | |||
| { | |||
| int32x4_t _sum01 = vdupq_n_s32(0); | |||
| int32x4_t _sum23 = vdupq_n_s32(0); | |||
| int32x4_t _sum45 = vdupq_n_s32(0); | |||
| int32x4_t _sum67 = vdupq_n_s32(0); | |||
| const signed char* kptr = weight_data_int8.channel(p); | |||
| @@ -73,46 +71,30 @@ static void convolution_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, c | |||
| int8x8_t _w1 = vld1_s8(kptr + 8); | |||
| int8x8_t _w2 = vld1_s8(kptr + 16); | |||
| int8x8_t _w3 = vld1_s8(kptr + 24); | |||
| int8x8_t _w4 = vld1_s8(kptr + 32); | |||
| int8x8_t _w5 = vld1_s8(kptr + 40); | |||
| int8x8_t _w6 = vld1_s8(kptr + 48); | |||
| int8x8_t _w7 = vld1_s8(kptr + 56); | |||
| int16x8_t _wv0 = vmull_s8(_val, _w0); | |||
| int16x8_t _wv1 = vmull_s8(_val, _w1); | |||
| int16x8_t _wv2 = vmull_s8(_val, _w2); | |||
| int16x8_t _wv3 = vmull_s8(_val, _w3); | |||
| int16x8_t _wv4 = vmull_s8(_val, _w4); | |||
| int16x8_t _wv5 = vmull_s8(_val, _w5); | |||
| int16x8_t _wv6 = vmull_s8(_val, _w6); | |||
| int16x8_t _wv7 = vmull_s8(_val, _w7); | |||
| int16x4_t _wv00 = vpadd_s16(vget_low_s16(_wv0), vget_high_s16(_wv0)); | |||
| int16x4_t _wv11 = vpadd_s16(vget_low_s16(_wv1), vget_high_s16(_wv1)); | |||
| int16x4_t _wv22 = vpadd_s16(vget_low_s16(_wv2), vget_high_s16(_wv2)); | |||
| int16x4_t _wv33 = vpadd_s16(vget_low_s16(_wv3), vget_high_s16(_wv3)); | |||
| int16x4_t _wv44 = vpadd_s16(vget_low_s16(_wv4), vget_high_s16(_wv4)); | |||
| int16x4_t _wv55 = vpadd_s16(vget_low_s16(_wv5), vget_high_s16(_wv5)); | |||
| int16x4_t _wv66 = vpadd_s16(vget_low_s16(_wv6), vget_high_s16(_wv6)); | |||
| int16x4_t _wv77 = vpadd_s16(vget_low_s16(_wv7), vget_high_s16(_wv7)); | |||
| _sum01 = vpadalq_s16(_sum01, vcombine_s16(_wv00, _wv11)); | |||
| _sum23 = vpadalq_s16(_sum23, vcombine_s16(_wv22, _wv33)); | |||
| _sum45 = vpadalq_s16(_sum45, vcombine_s16(_wv44, _wv55)); | |||
| _sum67 = vpadalq_s16(_sum67, vcombine_s16(_wv66, _wv77)); | |||
| kptr += 64; | |||
| kptr += 32; | |||
| } | |||
| } | |||
| int32x4_t _sum0 = vcombine_s32(vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)), vpadd_s32(vget_low_s32(_sum23), vget_high_s32(_sum23))); | |||
| int32x4_t _sum1 = vcombine_s32(vpadd_s32(vget_low_s32(_sum45), vget_high_s32(_sum45)), vpadd_s32(vget_low_s32(_sum67), vget_high_s32(_sum67))); | |||
| vst1q_s32(outptr + j * 8, _sum0); | |||
| vst1q_s32(outptr + j * 8 + 4, _sum1); | |||
| vst1q_s32(outptr + j * 4, _sum0); | |||
| } | |||
| outptr += outw * 8; | |||
| outptr += outw * 4; | |||
| } | |||
| } | |||
| } | |||
| @@ -12,7 +12,7 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void im2col_sgemm_pack1to8_int8_neon(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| static void im2col_sgemm_pack1to4_int8_neon(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); | |||
| @@ -2422,7 +2422,7 @@ static void im2col_sgemm_pack1to8_int8_neon(const Mat& bottom_im2col, Mat& top_b | |||
| } | |||
| } | |||
| static void convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -2519,7 +2519,7 @@ static void convolution_im2col_sgemm_transform_kernel_pack1to8_int8_neon(const M | |||
| } | |||
| } | |||
| static void convolution_im2col_sgemm_pack1to8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| static void convolution_im2col_sgemm_pack1to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -2583,5 +2583,5 @@ static void convolution_im2col_sgemm_pack1to8_int8_neon(const Mat& bottom_blob, | |||
| } | |||
| } | |||
| im2col_sgemm_pack1to8_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| im2col_sgemm_pack1to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| @@ -12,7 +12,7 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void im2col_sgemm_pack8_int8_neon(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| static void im2col_sgemm_pack8to4_int8_neon(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) | |||
| { | |||
| // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); | |||
| @@ -1103,7 +1103,7 @@ static void im2col_sgemm_pack8_int8_neon(const Mat& bottom_im2col, Mat& top_blob | |||
| } | |||
| } | |||
| static void convolution_im2col_sgemm_transform_kernel_pack8_int8_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) | |||
| { | |||
| const int maxk = kernel_w * kernel_h; | |||
| @@ -1163,7 +1163,7 @@ static void convolution_im2col_sgemm_transform_kernel_pack8_int8_neon(const Mat& | |||
| } | |||
| } | |||
| static void convolution_im2col_sgemm_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| static void convolution_im2col_sgemm_pack8to4_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| @@ -1234,5 +1234,5 @@ static void convolution_im2col_sgemm_pack8_int8_neon(const Mat& bottom_blob, Mat | |||
| } | |||
| } | |||
| im2col_sgemm_pack8_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| im2col_sgemm_pack8to4_int8_neon(bottom_im2col, top_blob, kernel, opt); | |||
| } | |||
| @@ -1538,31 +1538,31 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_ | |||
| int outw = (w - kernel_extent_w) / stride_w + 1; | |||
| int outh = (h - kernel_extent_h) / stride_h + 1; | |||
| int out_elempack = 1; | |||
| #if __ARM_NEON | |||
| if (opt.use_packing_layout) | |||
| // depth-wise | |||
| if (channels * elempack == group && group == num_output) | |||
| { | |||
| out_elempack = num_output % 8 == 0 ? 8 : 1; | |||
| } | |||
| int out_elempack = 1; | |||
| #if __ARM_NEON | |||
| if (opt.use_packing_layout) | |||
| { | |||
| out_elempack = num_output % 8 == 0 ? 8 : 1; | |||
| } | |||
| #endif // __ARM_NEON | |||
| bool use_int8_requantize = int8_scale_term > 100; | |||
| size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; | |||
| bool use_int8_requantize = int8_scale_term > 100; | |||
| size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| if (opt.use_fp16_storage) | |||
| { | |||
| out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; | |||
| } | |||
| if (opt.use_fp16_storage) | |||
| { | |||
| out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; | |||
| } | |||
| #endif | |||
| if (opt.use_bf16_storage) | |||
| out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; | |||
| if (opt.use_bf16_storage) | |||
| out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // depth-wise | |||
| if (channels * elempack == group && group == num_output) | |||
| { | |||
| // TODO use fp16 / bf16 | |||
| out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); | |||
| @@ -1934,6 +1934,28 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_ | |||
| return 0; | |||
| } | |||
| int out_elempack = 1; | |||
| #if __ARM_NEON | |||
| if (opt.use_packing_layout) | |||
| { | |||
| out_elempack = num_output % 4 == 0 ? 4 : 1; | |||
| } | |||
| #endif // __ARM_NEON | |||
| bool use_int8_requantize = int8_scale_term > 100; | |||
| size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; | |||
| #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
| if (opt.use_fp16_storage) | |||
| { | |||
| out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; | |||
| } | |||
| #endif | |||
| if (opt.use_bf16_storage) | |||
| out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // group convolution | |||
| const int channels_g = channels * elempack / group; | |||
| const int num_output_g = num_output / group; | |||
| @@ -1944,7 +1966,7 @@ int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_ | |||
| if (opt.use_packing_layout) | |||
| { | |||
| g_elempack = channels_g % 8 == 0 ? 8 : 1; | |||
| out_g_elempack = num_output_g % 8 == 0 ? 8 : 1; | |||
| out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; | |||
| } | |||
| #endif // __ARM_NEON | |||
| @@ -12,50 +12,328 @@ | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void padding_constant_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int8x8_t _v) | |||
| static void padding_constant_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int8x8_t v) | |||
| { | |||
| const signed char* ptr = src; | |||
| signed char* outptr = dst; | |||
| // fill top | |||
| for (int y = 0; y < top; y++) | |||
| { | |||
| for (int x = 0; x < dst.w; x++) | |||
| { | |||
| vst1_s8(outptr, _v); | |||
| outptr += 8; | |||
| } | |||
| } | |||
| // fill center | |||
| for (int y = 0; y < src.h; y++) | |||
| { | |||
| for (int x = 0; x < left; x++) | |||
| { | |||
| vst1_s8(outptr, _v); | |||
| outptr += 8; | |||
| } | |||
| for (int x = 0; x < src.w; x++) | |||
| { | |||
| int8x8_t _p = vld1_s8(ptr); | |||
| vst1_s8(outptr, _p); | |||
| ptr += 8; | |||
| outptr += 8; | |||
| } | |||
| for (int x = 0; x < right; x++) | |||
| { | |||
| vst1_s8(outptr, _v); | |||
| outptr += 8; | |||
| } | |||
| } | |||
| // fill bottom | |||
| for (int y = 0; y < bottom; y++) | |||
| { | |||
| for (int x = 0; x < dst.w; x++) | |||
| { | |||
| vst1_s8(outptr, _v); | |||
| outptr += 8; | |||
| } | |||
| } | |||
| int w = src.w; | |||
| int h = src.h; | |||
| int top_size = top * dst.w; | |||
| int bottom_size = bottom * dst.w; | |||
| #if __aarch64__ | |||
| asm volatile( | |||
| "mov v0.8b, %10.8b \n" | |||
| "mov v0.d[1], v0.d[0] \n" | |||
| "mov v1.16b, v0.16b \n" | |||
| "mov v2.16b, v0.16b \n" | |||
| "mov v3.16b, v0.16b \n" | |||
| // fill top | |||
| "lsr w4, %w8, #3 \n" // w4 = nn = top_size >> 3 | |||
| "cmp w4, #0 \n" | |||
| "beq 1f \n" | |||
| "0: \n" | |||
| "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], #64 \n" | |||
| "subs w4, w4, #1 \n" | |||
| "bne 0b \n" | |||
| "1: \n" | |||
| // fill top remain | |||
| "and w4, %w8, #7 \n" // w4 = remain = top_size & 7 | |||
| "cmp w4, #4 \n" // w4 >= 4 | |||
| "blt 2f \n" | |||
| "sub w4, w4, #4 \n" | |||
| "st1 {v0.16b, v1.16b}, [%0], #32 \n" | |||
| "2: \n" | |||
| "cmp w4, #2 \n" // w4 >= 2 | |||
| "blt 3f \n" | |||
| "sub w4, w4, #2 \n" | |||
| "st1 {v0.16b}, [%0], #16 \n" | |||
| "3: \n" | |||
| "cmp w4, #0 \n" // w4 > 0 | |||
| "beq 4f \n" | |||
| "st1 {v0.8b}, [%0], #8 \n" | |||
| "4: \n" | |||
| // fill center h loop | |||
| "cmp %w5, #0 \n" | |||
| "beq 15f \n" | |||
| "5: \n" | |||
| // fill left | |||
| "mov w4, %w6 \n" // w4 = left | |||
| "cmp w4, #0 \n" | |||
| "beq 7f \n" | |||
| "6: \n" | |||
| "st1 {v0.8b}, [%0], #8 \n" | |||
| "subs w4, w4, #1 \n" | |||
| "bne 6b \n" | |||
| "7: \n" | |||
| // fill middle | |||
| "lsr w4, %w4, #3 \n" // w4 = nn = w >> 3 | |||
| "cmp w4, #0 \n" | |||
| "beq 9f \n" | |||
| "8: \n" | |||
| "prfm pldl1keep, [%1, #512] \n" | |||
| "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%1], #64 \n" | |||
| "subs w4, w4, #1 \n" | |||
| "st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%0], #64 \n" | |||
| "bne 8b \n" | |||
| "9: \n" | |||
| "and w4, %w4, #7 \n" // w4 = remain = w & 7 | |||
| "cmp w4, #4 \n" // w4 >= 4 | |||
| "blt 10f \n" | |||
| "prfm pldl1keep, [%1, #256] \n" | |||
| "ld1 {v16.16b, v17.16b}, [%1], #32 \n" | |||
| "sub w4, w4, #4 \n" | |||
| "st1 {v16.16b, v17.16b}, [%0], #32 \n" | |||
| "10: \n" | |||
| "cmp w4, #2 \n" // w4 >= 2 | |||
| "blt 11f \n" | |||
| "prfm pldl1keep, [%1, #128] \n" | |||
| "ld1 {v16.16b}, [%1], #16 \n" | |||
| "sub w4, w4, #2 \n" | |||
| "st1 {v16.16b}, [%0], #16 \n" | |||
| "11: \n" | |||
| "cmp w4, #0 \n" // w4 > 0 | |||
| "beq 12f \n" | |||
| "prfm pldl1keep, [%1, #64] \n" | |||
| "ld1 {v16.8b}, [%1], #8 \n" | |||
| "st1 {v16.8b}, [%0], #8 \n" | |||
| "12: \n" | |||
| // fill right | |||
| "mov w4, %w7 \n" // w4 = right | |||
| "cmp w4, #0 \n" | |||
| "beq 14f \n" | |||
| "13: \n" | |||
| "subs w4, w4, #1 \n" | |||
| "st1 {v0.8b}, [%0], #8 \n" | |||
| "bne 13b \n" | |||
| "14: \n" | |||
| "subs %w5, %w5, #1 \n" | |||
| "bne 5b \n" | |||
| "15: \n" | |||
| // fill bottom | |||
| "lsr w4, %w9, #3 \n" // w4 = nn = bottom_size >> 3 | |||
| "cmp w4, #0 \n" | |||
| "beq 17f \n" | |||
| "16: \n" | |||
| "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], #64 \n" | |||
| "subs w4, w4, #1 \n" | |||
| "bne 16b \n" | |||
| "17: \n" | |||
| // fill bottom remain | |||
| "and w4, %w9, #7 \n" // w4 = remain = bottom_size & 7 | |||
| "cmp w4, #4 \n" // w4 >= 4 | |||
| "blt 18f \n" | |||
| "sub w4, w4, #4 \n" | |||
| "st1 {v0.16b, v1.16b}, [%0], #32 \n" | |||
| "18: \n" | |||
| "cmp w4, #2 \n" // w4 >= 2 | |||
| "blt 19f \n" | |||
| "sub w4, w4, #2 \n" | |||
| "st1 {v0.16b}, [%0], #16 \n" | |||
| "19: \n" | |||
| "cmp w4, #0 \n" // w4 > 0 | |||
| "beq 20f \n" | |||
| "st1 {v0.8b}, [%0], #8 \n" | |||
| "20: \n" | |||
| : "=r"(outptr), // %0 | |||
| "=r"(ptr) // %1 | |||
| : "0"(outptr), | |||
| "1"(ptr), | |||
| "r"(w), // %4 | |||
| "r"(h), // %5 | |||
| "r"(left), // %6 | |||
| "r"(right), // %7 | |||
| "r"(top_size), // %8 | |||
| "r"(bottom_size), // %9 | |||
| "w"(v) // %10 | |||
| : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); | |||
| #else // __aarch64__ | |||
| asm volatile( | |||
| "vmov d0, %P10 \n" | |||
| "vmov d1, d0 \n" | |||
| "vmov q1, q0 \n" | |||
| "vmov q2, q0 \n" | |||
| "vmov q3, q0 \n" | |||
| // fill top | |||
| "lsr r4, %8, #3 \n" // r4 = nn = top_size >> 3 | |||
| "cmp r4, #0 \n" | |||
| "beq 1f \n" | |||
| "0: \n" | |||
| "vstm %0!, {d0-d7} \n" | |||
| "subs r4, r4, #1 \n" | |||
| "bne 0b \n" | |||
| "1: \n" | |||
| // fill top remain | |||
| "and r4, %8, #7 \n" // r4 = remain = top_size & 7 | |||
| "cmp r4, #4 \n" // r4 >= 4 | |||
| "blt 2f \n" | |||
| "sub r4, r4, #4 \n" | |||
| "vst1.s8 {d0-d3}, [%0 :128]! \n" | |||
| "2: \n" | |||
| "cmp r4, #2 \n" // r4 >= 2 | |||
| "blt 3f \n" | |||
| "sub r4, r4, #2 \n" | |||
| "vst1.s8 {d0-d1}, [%0 :128]! \n" | |||
| "3: \n" | |||
| "cmp r4, #0 \n" // r4 > 0 | |||
| "beq 4f \n" | |||
| "vst1.s8 {d0}, [%0 :64]! \n" | |||
| "4: \n" | |||
| // fill center h loop | |||
| "cmp %5, #0 \n" | |||
| "beq 15f \n" | |||
| "5: \n" | |||
| // fill left | |||
| "mov r4, %6 \n" // r4 = left | |||
| "cmp r4, #0 \n" | |||
| "beq 7f \n" | |||
| "6: \n" | |||
| "vst1.s8 {d0}, [%0 :64]! \n" | |||
| "subs r4, r4, #1 \n" | |||
| "bne 6b \n" | |||
| "7: \n" | |||
| // fill middle | |||
| "lsr r4, %4, #3 \n" // r4 = nn = w >> 3 | |||
| "cmp r4, #0 \n" | |||
| "beq 9f \n" | |||
| "8: \n" | |||
| "pld [%1, #512] \n" | |||
| "vldm %1!, {d16-d23} \n" | |||
| "subs r4, r4, #1 \n" | |||
| "vstm %0!, {d16-d23} \n" | |||
| "bne 8b \n" | |||
| "9: \n" | |||
| "and r4, %4, #7 \n" // r4 = remain = w & 7 | |||
| "cmp r4, #4 \n" // r4 >= 4 | |||
| "blt 10f \n" | |||
| "pld [%1, #256] \n" | |||
| "vld1.s8 {d16-d19}, [%1 :64]! \n" | |||
| "sub r4, r4, #4 \n" | |||
| "vst1.s8 {d16-d19}, [%0 :64]! \n" | |||
| "10: \n" | |||
| "cmp r4, #2 \n" // r4 >= 2 | |||
| "blt 11f \n" | |||
| "pld [%1, #128] \n" | |||
| "vld1.s8 {d16-d17}, [%1 :64]! \n" | |||
| "sub r4, r4, #2 \n" | |||
| "vst1.s8 {d16-d17}, [%0 :64]! \n" | |||
| "11: \n" | |||
| "cmp r4, #0 \n" // r4 > 0 | |||
| "beq 12f \n" | |||
| "pld [%1, #64] \n" | |||
| "vld1.s8 {d16}, [%1 :64]! \n" | |||
| "vst1.s8 {d16}, [%0 :64]! \n" | |||
| "12: \n" | |||
| // fill right | |||
| "mov r4, %7 \n" // r4 = right | |||
| "cmp r4, #0 \n" | |||
| "beq 14f \n" | |||
| "13: \n" | |||
| "subs r4, r4, #1 \n" | |||
| "vst1.s8 {d0}, [%0 :64]! \n" | |||
| "bne 13b \n" | |||
| "14: \n" | |||
| "subs %5, %5, #1 \n" | |||
| "bne 5b \n" | |||
| "15: \n" | |||
| // fill bottom | |||
| "lsr r4, %9, #3 \n" // r4 = nn = bottom_size >> 3 | |||
| "cmp r4, #0 \n" | |||
| "beq 17f \n" | |||
| "16: \n" | |||
| "vstm %0!, {d0-d7} \n" | |||
| "subs r4, r4, #1 \n" | |||
| "bne 16b \n" | |||
| "17: \n" | |||
| // fill bottom remain | |||
| "and r4, %9, #7 \n" // r4 = remain = bottom_size & 7 | |||
| "cmp r4, #4 \n" // r4 >= 4 | |||
| "blt 18f \n" | |||
| "sub r4, r4, #4 \n" | |||
| "vst1.s8 {d0-d3}, [%0 :64]! \n" | |||
| "18: \n" | |||
| "cmp r4, #2 \n" // r4 >= 2 | |||
| "blt 19f \n" | |||
| "sub r4, r4, #2 \n" | |||
| "vst1.s8 {d0-d1}, [%0 :64]! \n" | |||
| "19: \n" | |||
| "cmp r4, #0 \n" // r4 > 0 | |||
| "beq 20f \n" | |||
| "vst1.s8 {d0}, [%0 :64]! \n" | |||
| "20: \n" | |||
| : "=r"(outptr), // %0 | |||
| "=r"(ptr) // %1 | |||
| : "0"(outptr), | |||
| "1"(ptr), | |||
| "r"(w), // %4 | |||
| "r"(h), // %5 | |||
| "r"(left), // %6 | |||
| "r"(right), // %7 | |||
| "r"(top_size), // %8 | |||
| "r"(bottom_size), // %9 | |||
| "w"(v) // %10 | |||
| : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); | |||
| #endif // __aarch64__ | |||
| } | |||
| static void padding_replicate_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right) | |||