diff --git a/src/layer/x86/convolution_3x3_pack8to1.h b/src/layer/x86/convolution_3x3_pack8to1.h index d07f0c1b6..a163ffa4a 100644 --- a/src/layer/x86/convolution_3x3_pack8to1.h +++ b/src/layer/x86/convolution_3x3_pack8to1.h @@ -157,10 +157,12 @@ static void conv3x3s1_winograd64_transform_kernel_pack8to1_avx(const Mat& kernel int p = 0; for (; p + 7 < outch; p += 8) { - float* g00 = kernel_tm_pack8.channel(p / 8); + Mat g0 = kernel_tm_pack8.channel(p / 8); for (int k = 0; k < 64; k++) { + float* g00 = g0.row(k); + for (int q = 0; q + 7 < inch; q += 8) { for (int i = 0; i < 8; i++) @@ -181,10 +183,12 @@ static void conv3x3s1_winograd64_transform_kernel_pack8to1_avx(const Mat& kernel { const Mat k0 = kernel_tm.channel(p); - float* g00 = kernel_tm_pack8.channel(p / 8 + p % 8); + Mat g0 = kernel_tm_pack8.channel(p / 8 + p % 8); for (int k = 0; k < 64; k++) { + float* g00 = g0.row(k); + for (int q = 0; q + 7 < inch; q += 8) { for (int i = 0; i < 8; i++) diff --git a/src/layer/x86/convolution_3x3_pack8to1_int8.h b/src/layer/x86/convolution_3x3_pack8to1_int8.h index 6ae6ac98b..4cf050cae 100644 --- a/src/layer/x86/convolution_3x3_pack8to1_int8.h +++ b/src/layer/x86/convolution_3x3_pack8to1_int8.h @@ -974,7 +974,7 @@ static void conv3x3s1_winograd42_pack8to1_int8_sse(const Mat& bottom_blob, Mat& _sum7 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl3, _sh3)); k0 += 8; - r0 += 16; + r0 += 32; } _sum0 = _mm_add_epi32(_sum0, _sum1); diff --git a/src/layer/x86/convolution_sgemm_int8.h b/src/layer/x86/convolution_sgemm_int8.h index 6969d27a7..9d0038f69 100644 --- a/src/layer/x86/convolution_sgemm_int8.h +++ b/src/layer/x86/convolution_sgemm_int8.h @@ -940,8 +940,7 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const { __m128i _val = _mm_loadl_epi64((const __m128i*)tmpptr); __m128i _extval = _mm_cmpgt_epi8(_mm_setzero_si128(), _val); - __m128i _val0 = _mm_unpacklo_epi8(_val, _extval); - __m128i _val1 = _mm_unpacklo_epi8(_val, _extval); + __m128i _val01 = _mm_unpacklo_epi8(_val, _extval); __m128i _w0123 = _mm_loadl_epi64((const __m128i*)kptr0); #if __SSE4_1__ @@ -950,14 +949,13 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const __m128i _extw = _mm_cmpgt_epi8(_mm_setzero_si128(), _w0123); __m128i _w = _mm_unpacklo_epi8(_w0123, _extw); #endif + _w = _mm_shuffle_epi32(_w, _MM_SHUFFLE(1, 0, 1, 0)); - __m128i _sl0 = _mm_mullo_epi16(_val0, _w); - __m128i _sh0 = _mm_mulhi_epi16(_val0, _w); - __m128i _sl1 = _mm_mullo_epi16(_val1, _w); - __m128i _sh1 = _mm_mulhi_epi16(_val1, _w); + __m128i _sl01 = _mm_mullo_epi16(_val01, _w); + __m128i _sh01 = _mm_mulhi_epi16(_val01, _w); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpacklo_epi16(_sl1, _sh1)); + _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl01, _sh01)); + _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl01, _sh01)); tmpptr += 8; kptr0 += 4; diff --git a/src/layer/x86/convolution_sgemm_pack1to4_int8.h b/src/layer/x86/convolution_sgemm_pack1to4_int8.h index b3934df1e..4627fa725 100644 --- a/src/layer/x86/convolution_sgemm_pack1to4_int8.h +++ b/src/layer/x86/convolution_sgemm_pack1to4_int8.h @@ -266,7 +266,7 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl for (; i + 3 < size; i += 4) { const signed char* tmpptr = tmp.channel(i / 4); - const signed char* kptr0 = kernel.channel(p / 4); + const signed char* kptr0 = kernel.channel(p); int nn4 = (inch / 4) * maxk; int nn1 = (inch % 4) * maxk; diff --git a/src/layer/x86/convolution_sgemm_pack8to1_int8.h b/src/layer/x86/convolution_sgemm_pack8to1_int8.h index 411a3f09a..d01e72def 100644 --- a/src/layer/x86/convolution_sgemm_pack8to1_int8.h +++ b/src/layer/x86/convolution_sgemm_pack8to1_int8.h @@ -704,7 +704,7 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum4_6 = _mm256_add_epi32(_sum4_6, _sum5_7); __m128i _sum0 = _mm256_extracti128_si256(_sum0_2, 0); __m128i _sum2 = _mm256_extracti128_si256(_sum0_2, 1); - __m128i _sum4 = _mm256_extracti128_si256(_sum4_6, 1); + __m128i _sum4 = _mm256_extracti128_si256(_sum4_6, 0); __m128i _sum6 = _mm256_extracti128_si256(_sum4_6, 1); outptr0[0] = _mm_reduce_add_epi32(_sum0); diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index 16a21c7ff..1bfdf1d20 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -1342,10 +1342,10 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt) { convolution_im2col_sgemm_transform_kernel_pack8to1_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); } - else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - conv3x3s1_winograd42_transform_kernel_pack8to1_int8_sse(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt); - } + // else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + // { + // conv3x3s1_winograd42_transform_kernel_pack8to1_int8_sse(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt); + // } else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) { convolution_im2col_sgemm_transform_kernel_pack8to1_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); @@ -1569,10 +1569,10 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con { conv1x1s2_sgemm_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); } - else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - conv3x3s1_winograd42_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt); - } + // else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + // { + // conv3x3s1_winograd42_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt); + // } else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) { convolution_im2col_sgemm_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);