Browse Source

x86 optimization for winograd unified elempack part2 (#4470)

* improve gemm packb threading
* optimize tile size
* profile winograd condition
* handle threads changes
tags/20230223
nihui GitHub 3 years ago
parent
commit
bd5bbe3f2c
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 1791 additions and 1622 deletions
  1. +1549
    -1521
      src/layer/x86/convolution_3x3_winograd.h
  2. +186
    -58
      src/layer/x86/convolution_x86.cpp
  3. +1
    -0
      src/layer/x86/convolution_x86.h
  4. +55
    -43
      src/layer/x86/gemm_x86.cpp

+ 1549
- 1521
src/layer/x86/convolution_3x3_winograd.h
File diff suppressed because it is too large
View File


+ 186
- 58
src/layer/x86/convolution_x86.cpp View File

@@ -103,6 +103,7 @@ Convolution_x86::Convolution_x86()
#endif // __SSE2__

activation = 0;
nT = 0;
convolution_dilation1 = 0;
gemm = 0;
}
@@ -143,12 +144,177 @@ static void convolution_transform_kernel_packed_sse(const Mat& weight_data, Mat&
}
}

static bool test_prefer_winograd63(int num_input, int num_output, int w, int h)
{
// winograd selection strategy (profiled on i7-7700 single thread)

int minwh = std::min(w, h);

if (num_input >= 64)
{
return false;
}
if (num_input >= 32)
{
if (num_output >= 64) return false;
if (num_output >= 32) return (minwh >= 11 && minwh <= 14)
|| (minwh >= 19 && minwh <= 20)
|| (minwh >= 23 && minwh <= 44)
|| (minwh >= 47 && minwh <= 56)
|| (minwh >= 63 && minwh <= 130);
if (num_output >= 16) return (minwh >= 13 && minwh <= 14)
|| (minwh >= 19 && minwh <= 20)
|| (minwh >= 23 && minwh <= 38)
|| (minwh >= 43 && minwh <= 44)
|| (minwh >= 47 && minwh <= 140);
if (num_output >= 8) return (minwh >= 11 && minwh <= 14)
|| (minwh >= 19 && minwh <= 20)
|| (minwh >= 31 && minwh <= 38)
|| (minwh >= 43 && minwh <= 44)
|| (minwh >= 55 && minwh <= 162);
return false;
}
if (num_input >= 16)
{
if (num_output >= 64) return false;
if (num_output >= 32) return (minwh >= 11 && minwh <= 14)
|| (minwh >= 19 && minwh <= 20)
|| (minwh >= 23 && minwh <= 44)
|| (minwh >= 47 && minwh <= 92)
|| (minwh >= 95 && minwh <= 188);
if (num_output >= 16) return (minwh >= 11 && minwh <= 14)
|| (minwh >= 27 && minwh <= 38)
|| (minwh >= 43 && minwh <= 44)
|| (minwh >= 47 && minwh <= 74)
|| (minwh >= 81 && minwh <= 110)
|| (minwh >= 117 && minwh <= 170)
|| (minwh >= 177 && minwh <= 182);
if (num_output >= 8) return (minwh >= 19 && minwh <= 20)
|| (minwh >= 33 && minwh <= 38)
|| (minwh >= 43 && minwh <= 44)
|| (minwh >= 47 && minwh <= 128)
|| (minwh >= 155 && minwh <= 210);
return false;
}
if (num_input >= 8)
{
if (num_output >= 64) return false;
if (num_output >= 32) return (minwh >= 7 && minwh <= 14)
|| (minwh >= 17 && minwh <= 20)
|| (minwh >= 23 && minwh <= 26)
|| (minwh >= 31 && minwh <= 38)
|| (minwh >= 43 && minwh <= 162);
if (num_output >= 16) return minwh == 31 || minwh == 32
|| (minwh >= 39 && minwh <= 44)
|| (minwh >= 47 && minwh <= 212);
if (num_output >= 8) return false;
return false;
}

return false;
}

static bool test_prefer_winograd23(int num_input, int num_output, int w, int h)
{
int minwh = std::min(w, h);

if (num_input >= 512)
{
if (num_output >= 512) return (minwh >= 3 && minwh <= 14);
if (num_output >= 256) return (minwh >= 3 && minwh <= 14);
if (num_output >= 128) return (minwh >= 3 && minwh <= 14);
if (num_output >= 64) return (minwh >= 3 && minwh <= 8) || (minwh >= 11 && minwh <= 12);
if (num_output >= 32) return (minwh >= 3 && minwh <= 8);
if (num_output >= 16) return (minwh >= 3 && minwh <= 8);
if (num_output >= 8) return (minwh >= 3 && minwh <= 6);
return false;
}
if (num_input >= 256)
{
if (num_output >= 512) return (minwh >= 3 && minwh <= 14);
if (num_output >= 256) return (minwh >= 3 && minwh <= 14);
if (num_output >= 128) return (minwh >= 3 && minwh <= 12);
if (num_output >= 64) return (minwh >= 3 && minwh <= 4);
if (num_output >= 32) return (minwh >= 3 && minwh <= 8);
if (num_output >= 16) return (minwh >= 3 && minwh <= 8);
if (num_output >= 8) return (minwh >= 3 && minwh <= 6);
return false;
}
if (num_input >= 128)
{
if (num_output >= 512) return (minwh >= 3 && minwh <= 14);
if (num_output >= 256) return (minwh >= 3 && minwh <= 8) || (minwh >= 11 && minwh <= 12);
if (num_output >= 128) return (minwh >= 3 && minwh <= 10);
if (num_output >= 64) return (minwh >= 3 && minwh <= 8);
if (num_output >= 32) return (minwh >= 3 && minwh <= 10);
if (num_output >= 16) return (minwh >= 3 && minwh <= 6);
if (num_output >= 8) return (minwh >= 3 && minwh <= 6);
return false;
}
if (num_input >= 64)
{
if (num_output >= 512) return (minwh >= 3 && minwh <= 8) || (minwh >= 11 && minwh <= 12) || (minwh >= 15 && minwh <= 20);
if (num_output >= 256) return (minwh >= 7 && minwh <= 8);
if (num_output >= 128) return (minwh >= 3 && minwh <= 8) || (minwh >= 19 && minwh <= 22);
if (num_output >= 64) return (minwh >= 3 && minwh <= 12);
if (num_output >= 32) return (minwh >= 3 && minwh <= 12);
if (num_output >= 16) return (minwh >= 3 && minwh <= 12);
if (num_output >= 8) return (minwh >= 3 && minwh <= 12);
return false;
}
if (num_input >= 32)
{
if (num_output >= 512) return (minwh >= 3 && minwh <= 6) || (minwh >= 11 && minwh <= 12);
if (num_output >= 256) return (minwh >= 3 && minwh <= 6) || (minwh >= 11 && minwh <= 12);
if (num_output >= 128) return (minwh >= 3 && minwh <= 4) || (minwh >= 7 && minwh <= 16);
if (num_output >= 64) return (minwh >= 3 && minwh <= 8);
if (num_output >= 32) return (minwh >= 7 && minwh <= 8);
if (num_output >= 16) return (minwh >= 7 && minwh <= 8);
if (num_output >= 8) return (minwh >= 3 && minwh <= 10);
return false;
}
if (num_input >= 16)
{
if (num_output >= 512) return (minwh >= 11 && minwh <= 12);
if (num_output >= 256) return (minwh >= 3 && minwh <= 12);
if (num_output >= 128) return (minwh >= 3 && minwh <= 6)
|| (minwh >= 9 && minwh <= 18);
if (num_output >= 64) return (minwh >= 3 && minwh <= 4)
|| (minwh >= 7 && minwh <= 8)
|| (minwh >= 11 && minwh <= 12)
|| (minwh >= 15 && minwh <= 18);
if (num_output >= 32) return (minwh >= 3 && minwh <= 4)
|| (minwh >= 9 && minwh <= 10);
if (num_output >= 16) return (minwh >= 3 && minwh <= 10);
if (num_output >= 8) return (minwh >= 3 && minwh <= 8)
|| (minwh >= 11 && minwh <= 12);
return false;
}
if (num_input >= 8)
{
if (num_output >= 128) return false;
if (num_output >= 64) return (minwh >= 3 && minwh <= 4)
|| (minwh >= 7 && minwh <= 14)
|| (minwh >= 47 && minwh <= 48);
if (num_output >= 32) return (minwh >= 3 && minwh <= 6)
|| (minwh >= 15 && minwh <= 16);
if (num_output >= 16) return (minwh >= 3 && minwh <= 6)
|| (minwh >= 9 && minwh <= 14)
|| (minwh >= 47 && minwh <= 212);
if (num_output >= 8) return true;
return false;
}

return false;
}

int Convolution_x86::create_pipeline(const Option& opt)
{
if (dynamic_weight)
return 0;

activation = create_activation_layer(activation_type, activation_params, opt);
nT = opt.num_threads;

#if NCNN_INT8
if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
@@ -226,14 +392,14 @@ int Convolution_x86::create_pipeline(const Option& opt)
}
#endif // __SSE2__

bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && num_input >= 16 && num_output >= 16;
bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input > 8 || num_output > 8);

if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
if ((bottom_shapes.empty() || bottom_shapes[0].w == 0 || bottom_shapes[0].h == 0) && (top_shapes.empty() || top_shapes[0].w == 0 || top_shapes[0].h == 0))
{
// dynamic shape
if (opt.use_winograd63_convolution && num_input <= 24 && num_output <= 24)
if ((opt.use_winograd63_convolution) && (num_input <= 32 && num_output <= 32))
conv3x3s1_winograd63_transform_kernel(weight_data, weight_winograd63_data, num_input, num_output, opt);
else if (opt.use_winograd43_convolution)
conv3x3s1_winograd43_transform_kernel(weight_data, weight_winograd43_data, num_input, num_output, opt);
@@ -242,20 +408,6 @@ int Convolution_x86::create_pipeline(const Option& opt)
}
else
{
// winograd selection strategy
//
// | | | | | c/outc
// | | | | | f63 ^
// | | | | | +----------------+128
// | | | | |f63| f43
// |f23|f43|f63|f43| +---+ +64
// | | | | | f63 | f43
// | | | | | +---+ +32
// | | | | | f63 | f43
// +---+---+---+---+---+---+---+--------+16
// 0 14 19 21 31 96 132 192 --> wh
//

int w;
int h;
if (top_shapes.empty() || top_shapes[0].w == 0 || top_shapes[0].h == 0)
@@ -283,18 +435,9 @@ int Convolution_x86::create_pipeline(const Option& opt)
h = top_shapes[0].h + 2;
}

const int minwh = std::min(w, h);

bool prefer_winograd63 = minwh == 19 || minwh == 20
|| (minwh > 30 && num_input >= 128)
|| (minwh > 30 && num_input >= 64 && num_input < 128 && num_output >= 128)
|| (minwh > 30 && num_input >= 64 && num_input < 128 && num_output < 128 && minwh < 96)
|| (minwh > 30 && num_input >= 16 && num_input < 64 && num_output >= 64)
|| (minwh > 30 && num_input >= 32 && num_input < 64 && num_output < 64 && minwh < 132)
|| (minwh > 30 && num_input >= 16 && num_input < 32 && num_output < 64 && minwh < 192);

bool prefer_winograd43 = (minwh > 14 && !prefer_winograd63);
bool prefer_winograd23 = (!prefer_winograd43 && !prefer_winograd63);
bool prefer_winograd63 = test_prefer_winograd63(num_input, num_output, w, h);
bool prefer_winograd23 = test_prefer_winograd23(num_input, num_output, w, h);
bool prefer_winograd43 = !prefer_winograd63 && !prefer_winograd23;

if (prefer_winograd23 && !opt.use_winograd23_convolution)
{
@@ -557,36 +700,13 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option

const int num_input = channels * elempack;

bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && num_input >= 16 && num_output >= 16;
bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input > 8 || num_output > 8);

if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
{
// winograd selection strategy
//
// | | | | | c/outc
// | | | | | f63 ^
// | | | | | +----------------+128
// | | | | |f63| f43
// |f23|f43|f63|f43| +---+ +64
// | | | | | f63 | f43
// | | | | | +---+ +32
// | | | | | f63 | f43
// +---+---+---+---+---+---+---+--------+16
// 0 14 19 21 31 96 132 192 --> wh
//

const int minwh = std::min(w, h);

bool prefer_winograd63 = minwh == 19 || minwh == 20
|| (minwh > 30 && num_input >= 128)
|| (minwh > 30 && num_input >= 64 && num_input < 128 && num_output >= 128)
|| (minwh > 30 && num_input >= 64 && num_input < 128 && num_output < 128 && minwh < 96)
|| (minwh > 30 && num_input >= 16 && num_input < 64 && num_output >= 64)
|| (minwh > 30 && num_input >= 32 && num_input < 64 && num_output < 64 && minwh < 132)
|| (minwh > 30 && num_input >= 16 && num_input < 32 && num_output < 64 && minwh < 192);

bool prefer_winograd43 = (minwh > 14 && !prefer_winograd63);
bool prefer_winograd23 = (!prefer_winograd43 && !prefer_winograd63);
bool prefer_winograd63 = test_prefer_winograd63(num_input, num_output, w, h);
bool prefer_winograd23 = test_prefer_winograd23(num_input, num_output, w, h);
bool prefer_winograd43 = !prefer_winograd63 && !prefer_winograd23;

if (prefer_winograd23 && (!opt.use_winograd23_convolution || weight_winograd23_data.empty()))
{
@@ -616,17 +736,25 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
}
}

int _nT = nT ? nT : opt.num_threads;
if (nT != 0 && opt.num_threads != nT)
{
// force num_threads the same as in create_pipeline
// so we could use pre-packed A/B from the same tile config
NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT);
}

if (prefer_winograd23)
{
conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt);
conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt);
}
else if (prefer_winograd43)
{
conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt);
conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt);
}
else if (prefer_winograd63)
{
conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, opt);
conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt);
}
else
{


+ 1
- 0
src/layer/x86/convolution_x86.h View File

@@ -41,6 +41,7 @@ protected:
public:
Layer* activation;

int nT;
Mat weight_data_tm;
Mat weight_sgemm_data;
Mat weight_winograd23_data;


+ 55
- 43
src/layer/x86/gemm_x86.cpp View File

@@ -5873,6 +5873,7 @@ static int gemm_x86(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int

int nn_M = (M + TILE_M - 1) / TILE_M;
int nn_N = (N + TILE_N - 1) / TILE_N;
int nn_K = (K + TILE_K - 1) / TILE_K;

Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 4u, opt.blob_allocator);
Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.blob_allocator);
@@ -5881,27 +5882,30 @@ static int gemm_x86(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int
if (K > TILE_K)
tmpX.create(TILE_N, TILE_M, nT, 4u, opt.blob_allocator);

const int nn_NK = nn_N * nn_K;

// pack B
#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_N; ppj++)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
const int ppj = ppjk / nn_K;
const int ppk = ppjk % nn_K;

const int j = ppj * TILE_N;
const int k = ppk * TILE_K;

for (int k = 0; k < K; k += TILE_K)
{
const int max_jj = std::min((N - j), TILE_N);
const int max_kk = std::min((K - k), TILE_K);
const int max_jj = std::min((N - j), TILE_N);
const int max_kk = std::min((K - k), TILE_K);

Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);
Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

if (transB)
{
pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
}
else
{
transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
}
if (transB)
{
pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
}
else
{
transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
}
}

@@ -5965,6 +5969,7 @@ static int gemm_AT_x86(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob,

int nn_M = (M + TILE_M - 1) / TILE_M;
int nn_N = (N + TILE_N - 1) / TILE_N;
int nn_K = (K + TILE_K - 1) / TILE_K;

Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.blob_allocator);

@@ -5972,27 +5977,30 @@ static int gemm_AT_x86(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob,
if (K > TILE_K)
tmpX.create(TILE_N, TILE_M, nT, 4u, opt.blob_allocator);

const int nn_NK = nn_N * nn_K;

// pack B
#pragma omp parallel for num_threads(nT)
for (int ppj = 0; ppj < nn_N; ppj++)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
const int ppj = ppjk / nn_K;
const int ppk = ppjk % nn_K;

const int j = ppj * TILE_N;
const int k = ppk * TILE_K;

for (int k = 0; k < K; k += TILE_K)
{
const int max_jj = std::min((N - j), TILE_N);
const int max_kk = std::min((K - k), TILE_K);
const int max_jj = std::min((N - j), TILE_N);
const int max_kk = std::min((K - k), TILE_K);

Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);
Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

if (transB)
{
pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
}
else
{
transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
}
if (transB)
{
pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
}
else
{
transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
}
}

@@ -6203,31 +6211,35 @@ int Gemm_x86::create_pipeline(const Option& opt)
get_optimal_tile_mnk(0, N, K, TILE_M, TILE_N, TILE_K, opt.num_threads);

const int nn_N = (N + TILE_N - 1) / TILE_N;
const int nn_K = (K + TILE_K - 1) / TILE_K;

BT_data.create(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.blob_allocator);
if (BT_data.empty())
return -100;

#pragma omp parallel for num_threads(opt.num_threads)
for (int ppj = 0; ppj < nn_N; ppj++)
const int nn_NK = nn_N * nn_K;

#pragma omp parallel for num_threads(nT)
for (int ppjk = 0; ppjk < nn_NK; ppjk++)
{
const int ppj = ppjk / nn_K;
const int ppk = ppjk % nn_K;

const int j = ppj * TILE_N;
const int k = ppk * TILE_K;

for (int k = 0; k < K; k += TILE_K)
{
const int max_jj = std::min((N - j), TILE_N);
const int max_kk = std::min((K - k), TILE_K);
const int max_jj = std::min((N - j), TILE_N);
const int max_kk = std::min((K - k), TILE_K);

Mat BT_tile = BT_data.channel(j / TILE_N).row_range(k / TILE_K, 1);
Mat BT_tile = BT_data.channel(j / TILE_N).row_range(k / TILE_K, 1);

if (transB)
{
pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk);
}
else
{
transpose_pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk);
}
if (transB)
{
pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk);
}
else
{
transpose_pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk);
}
}



Loading…
Cancel
Save