Browse Source

fix ci, first try

tags/20220216
nihuini 4 years ago
parent
commit
cfedcfdc57
No known key found for this signature in database. GPG Key ID: 98FD8F4EBC3E5DB8
6 changed files with 23 additions and 21 deletions
  1. +6
    -2
      src/layer/x86/convolution_3x3_pack8to1.h
  2. +1
    -1
      src/layer/x86/convolution_3x3_pack8to1_int8.h
  3. +6
    -8
      src/layer/x86/convolution_sgemm_int8.h
  4. +1
    -1
      src/layer/x86/convolution_sgemm_pack1to4_int8.h
  5. +1
    -1
      src/layer/x86/convolution_sgemm_pack8to1_int8.h
  6. +8
    -8
      src/layer/x86/convolution_x86.cpp

+ 6
- 2
src/layer/x86/convolution_3x3_pack8to1.h View File

@@ -157,10 +157,12 @@ static void conv3x3s1_winograd64_transform_kernel_pack8to1_avx(const Mat& kernel
int p = 0;
for (; p + 7 < outch; p += 8)
{
float* g00 = kernel_tm_pack8.channel(p / 8);
Mat g0 = kernel_tm_pack8.channel(p / 8);

for (int k = 0; k < 64; k++)
{
float* g00 = g0.row(k);

for (int q = 0; q + 7 < inch; q += 8)
{
for (int i = 0; i < 8; i++)
@@ -181,10 +183,12 @@ static void conv3x3s1_winograd64_transform_kernel_pack8to1_avx(const Mat& kernel
{
const Mat k0 = kernel_tm.channel(p);

float* g00 = kernel_tm_pack8.channel(p / 8 + p % 8);
Mat g0 = kernel_tm_pack8.channel(p / 8 + p % 8);

for (int k = 0; k < 64; k++)
{
float* g00 = g0.row(k);

for (int q = 0; q + 7 < inch; q += 8)
{
for (int i = 0; i < 8; i++)


+ 1
- 1
src/layer/x86/convolution_3x3_pack8to1_int8.h View File

@@ -974,7 +974,7 @@ static void conv3x3s1_winograd42_pack8to1_int8_sse(const Mat& bottom_blob, Mat&
_sum7 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl3, _sh3));

k0 += 8;
r0 += 16;
r0 += 32;
}

_sum0 = _mm_add_epi32(_sum0, _sum1);


+ 6
- 8
src/layer/x86/convolution_sgemm_int8.h View File

@@ -940,8 +940,7 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const
{
__m128i _val = _mm_loadl_epi64((const __m128i*)tmpptr);
__m128i _extval = _mm_cmpgt_epi8(_mm_setzero_si128(), _val);
__m128i _val0 = _mm_unpacklo_epi8(_val, _extval);
__m128i _val1 = _mm_unpacklo_epi8(_val, _extval);
__m128i _val01 = _mm_unpacklo_epi8(_val, _extval);

__m128i _w0123 = _mm_loadl_epi64((const __m128i*)kptr0);
#if __SSE4_1__
@@ -950,14 +949,13 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const
__m128i _extw = _mm_cmpgt_epi8(_mm_setzero_si128(), _w0123);
__m128i _w = _mm_unpacklo_epi8(_w0123, _extw);
#endif
_w = _mm_shuffle_epi32(_w, _MM_SHUFFLE(1, 0, 1, 0));

__m128i _sl0 = _mm_mullo_epi16(_val0, _w);
__m128i _sh0 = _mm_mulhi_epi16(_val0, _w);
__m128i _sl1 = _mm_mullo_epi16(_val1, _w);
__m128i _sh1 = _mm_mulhi_epi16(_val1, _w);
__m128i _sl01 = _mm_mullo_epi16(_val01, _w);
__m128i _sh01 = _mm_mulhi_epi16(_val01, _w);

_sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0));
_sum1 = _mm_add_epi32(_sum1, _mm_unpacklo_epi16(_sl1, _sh1));
_sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl01, _sh01));
_sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl01, _sh01));

tmpptr += 8;
kptr0 += 4;


+ 1
- 1
src/layer/x86/convolution_sgemm_pack1to4_int8.h View File

@@ -266,7 +266,7 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl
for (; i + 3 < size; i += 4)
{
const signed char* tmpptr = tmp.channel(i / 4);
const signed char* kptr0 = kernel.channel(p / 4);
const signed char* kptr0 = kernel.channel(p);

int nn4 = (inch / 4) * maxk;
int nn1 = (inch % 4) * maxk;


+ 1
- 1
src/layer/x86/convolution_sgemm_pack8to1_int8.h View File

@@ -704,7 +704,7 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl
_sum4_6 = _mm256_add_epi32(_sum4_6, _sum5_7);
__m128i _sum0 = _mm256_extracti128_si256(_sum0_2, 0);
__m128i _sum2 = _mm256_extracti128_si256(_sum0_2, 1);
__m128i _sum4 = _mm256_extracti128_si256(_sum4_6, 1);
__m128i _sum4 = _mm256_extracti128_si256(_sum4_6, 0);
__m128i _sum6 = _mm256_extracti128_si256(_sum4_6, 1);

outptr0[0] = _mm_reduce_add_epi32(_sum0);


+ 8
- 8
src/layer/x86/convolution_x86.cpp View File

@@ -1342,10 +1342,10 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
}
else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd42_transform_kernel_pack8to1_int8_sse(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt);
}
// else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
// {
// conv3x3s1_winograd42_transform_kernel_pack8to1_int8_sse(weight_data, weight_3x3_winograd42_data, num_input, num_output, opt);
// }
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_transform_kernel_pack8to1_int8_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
@@ -1569,10 +1569,10 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
{
conv1x1s2_sgemm_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
}
else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
conv3x3s1_winograd42_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt);
}
// else if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
// {
// conv3x3s1_winograd42_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd42_data, opt);
// }
else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
{
convolution_im2col_sgemm_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);


Loading…
Cancel
Save