x86 optimization for winograd unified elempack part2 (#4470)

* improve gemm packb threading * optimize tile size * profile winograd condition * handle threads changes
3 years ago · bd5bbe3f2c
--- a/src/layer/x86/convolution_3x3_winograd.h
+++ b/src/layer/x86/convolution_3x3_winograd.h
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -103,6 +103,7 @@ Convolution_x86::Convolution_x86()
 #endif // __SSE2__

    activation = 0;
    nT = 0;
    convolution_dilation1 = 0;
    gemm = 0;
 }
@@ -143,12 +144,177 @@ static void convolution_transform_kernel_packed_sse(const Mat& weight_data, Mat&
    }
 }

 static bool test_prefer_winograd63(int num_input, int num_output, int w, int h)
 {
    // winograd selection strategy (profiled on i7-7700 single thread)

    int minwh = std::min(w, h);

    if (num_input >= 64)
    {
        return false;
    }
    if (num_input >= 32)
    {
        if (num_output >= 64) return false;
        if (num_output >= 32) return (minwh >= 11 && minwh <= 14)
                                         || (minwh >= 19 && minwh <= 20)
                                         || (minwh >= 23 && minwh <= 44)
                                         || (minwh >= 47 && minwh <= 56)
                                         || (minwh >= 63 && minwh <= 130);
        if (num_output >= 16) return (minwh >= 13 && minwh <= 14)
                                         || (minwh >= 19 && minwh <= 20)
                                         || (minwh >= 23 && minwh <= 38)
                                         || (minwh >= 43 && minwh <= 44)
                                         || (minwh >= 47 && minwh <= 140);
        if (num_output >= 8) return (minwh >= 11 && minwh <= 14)
                                        || (minwh >= 19 && minwh <= 20)
                                        || (minwh >= 31 && minwh <= 38)
                                        || (minwh >= 43 && minwh <= 44)
                                        || (minwh >= 55 && minwh <= 162);
        return false;
    }
    if (num_input >= 16)
    {
        if (num_output >= 64) return false;
        if (num_output >= 32) return (minwh >= 11 && minwh <= 14)
                                         || (minwh >= 19 && minwh <= 20)
                                         || (minwh >= 23 && minwh <= 44)
                                         || (minwh >= 47 && minwh <= 92)
                                         || (minwh >= 95 && minwh <= 188);
        if (num_output >= 16) return (minwh >= 11 && minwh <= 14)
                                         || (minwh >= 27 && minwh <= 38)
                                         || (minwh >= 43 && minwh <= 44)
                                         || (minwh >= 47 && minwh <= 74)
                                         || (minwh >= 81 && minwh <= 110)
                                         || (minwh >= 117 && minwh <= 170)
                                         || (minwh >= 177 && minwh <= 182);
        if (num_output >= 8) return (minwh >= 19 && minwh <= 20)
                                        || (minwh >= 33 && minwh <= 38)
                                        || (minwh >= 43 && minwh <= 44)
                                        || (minwh >= 47 && minwh <= 128)
                                        || (minwh >= 155 && minwh <= 210);
        return false;
    }
    if (num_input >= 8)
    {
        if (num_output >= 64) return false;
        if (num_output >= 32) return (minwh >= 7 && minwh <= 14)
                                         || (minwh >= 17 && minwh <= 20)
                                         || (minwh >= 23 && minwh <= 26)
                                         || (minwh >= 31 && minwh <= 38)
                                         || (minwh >= 43 && minwh <= 162);
        if (num_output >= 16) return minwh == 31 || minwh == 32
                                         || (minwh >= 39 && minwh <= 44)
                                         || (minwh >= 47 && minwh <= 212);
        if (num_output >= 8) return false;
        return false;
    }

    return false;
 }

 static bool test_prefer_winograd23(int num_input, int num_output, int w, int h)
 {
    int minwh = std::min(w, h);

    if (num_input >= 512)
    {
        if (num_output >= 512) return (minwh >= 3 && minwh <= 14);
        if (num_output >= 256) return (minwh >= 3 && minwh <= 14);
        if (num_output >= 128) return (minwh >= 3 && minwh <= 14);
        if (num_output >= 64) return (minwh >= 3 && minwh <= 8) || (minwh >= 11 && minwh <= 12);
        if (num_output >= 32) return (minwh >= 3 && minwh <= 8);
        if (num_output >= 16) return (minwh >= 3 && minwh <= 8);
        if (num_output >= 8) return (minwh >= 3 && minwh <= 6);
        return false;
    }
    if (num_input >= 256)
    {
        if (num_output >= 512) return (minwh >= 3 && minwh <= 14);
        if (num_output >= 256) return (minwh >= 3 && minwh <= 14);
        if (num_output >= 128) return (minwh >= 3 && minwh <= 12);
        if (num_output >= 64) return (minwh >= 3 && minwh <= 4);
        if (num_output >= 32) return (minwh >= 3 && minwh <= 8);
        if (num_output >= 16) return (minwh >= 3 && minwh <= 8);
        if (num_output >= 8) return (minwh >= 3 && minwh <= 6);
        return false;
    }
    if (num_input >= 128)
    {
        if (num_output >= 512) return (minwh >= 3 && minwh <= 14);
        if (num_output >= 256) return (minwh >= 3 && minwh <= 8) || (minwh >= 11 && minwh <= 12);
        if (num_output >= 128) return (minwh >= 3 && minwh <= 10);
        if (num_output >= 64) return (minwh >= 3 && minwh <= 8);
        if (num_output >= 32) return (minwh >= 3 && minwh <= 10);
        if (num_output >= 16) return (minwh >= 3 && minwh <= 6);
        if (num_output >= 8) return (minwh >= 3 && minwh <= 6);
        return false;
    }
    if (num_input >= 64)
    {
        if (num_output >= 512) return (minwh >= 3 && minwh <= 8) || (minwh >= 11 && minwh <= 12) || (minwh >= 15 && minwh <= 20);
        if (num_output >= 256) return (minwh >= 7 && minwh <= 8);
        if (num_output >= 128) return (minwh >= 3 && minwh <= 8) || (minwh >= 19 && minwh <= 22);
        if (num_output >= 64) return (minwh >= 3 && minwh <= 12);
        if (num_output >= 32) return (minwh >= 3 && minwh <= 12);
        if (num_output >= 16) return (minwh >= 3 && minwh <= 12);
        if (num_output >= 8) return (minwh >= 3 && minwh <= 12);
        return false;
    }
    if (num_input >= 32)
    {
        if (num_output >= 512) return (minwh >= 3 && minwh <= 6) || (minwh >= 11 && minwh <= 12);
        if (num_output >= 256) return (minwh >= 3 && minwh <= 6) || (minwh >= 11 && minwh <= 12);
        if (num_output >= 128) return (minwh >= 3 && minwh <= 4) || (minwh >= 7 && minwh <= 16);
        if (num_output >= 64) return (minwh >= 3 && minwh <= 8);
        if (num_output >= 32) return (minwh >= 7 && minwh <= 8);
        if (num_output >= 16) return (minwh >= 7 && minwh <= 8);
        if (num_output >= 8) return (minwh >= 3 && minwh <= 10);
        return false;
    }
    if (num_input >= 16)
    {
        if (num_output >= 512) return (minwh >= 11 && minwh <= 12);
        if (num_output >= 256) return (minwh >= 3 && minwh <= 12);
        if (num_output >= 128) return (minwh >= 3 && minwh <= 6)
                                          || (minwh >= 9 && minwh <= 18);
        if (num_output >= 64) return (minwh >= 3 && minwh <= 4)
                                         || (minwh >= 7 && minwh <= 8)
                                         || (minwh >= 11 && minwh <= 12)
                                         || (minwh >= 15 && minwh <= 18);
        if (num_output >= 32) return (minwh >= 3 && minwh <= 4)
                                         || (minwh >= 9 && minwh <= 10);
        if (num_output >= 16) return (minwh >= 3 && minwh <= 10);
        if (num_output >= 8) return (minwh >= 3 && minwh <= 8)
                                        || (minwh >= 11 && minwh <= 12);
        return false;
    }
    if (num_input >= 8)
    {
        if (num_output >= 128) return false;
        if (num_output >= 64) return (minwh >= 3 && minwh <= 4)
                                         || (minwh >= 7 && minwh <= 14)
                                         || (minwh >= 47 && minwh <= 48);
        if (num_output >= 32) return (minwh >= 3 && minwh <= 6)
                                         || (minwh >= 15 && minwh <= 16);
        if (num_output >= 16) return (minwh >= 3 && minwh <= 6)
                                         || (minwh >= 9 && minwh <= 14)
                                         || (minwh >= 47 && minwh <= 212);
        if (num_output >= 8) return true;
        return false;
    }

    return false;
 }

 int Convolution_x86::create_pipeline(const Option& opt)
 {
    if (dynamic_weight)
        return 0;

    activation = create_activation_layer(activation_type, activation_params, opt);
    nT = opt.num_threads;

 #if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
@@ -226,14 +392,14 @@ int Convolution_x86::create_pipeline(const Option& opt)
    }
 #endif // __SSE2__

    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && num_input >= 16 && num_output >= 16;
    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input > 8 || num_output > 8);

    if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        if ((bottom_shapes.empty() || bottom_shapes[0].w == 0 || bottom_shapes[0].h == 0) && (top_shapes.empty() || top_shapes[0].w == 0 || top_shapes[0].h == 0))
        {
            // dynamic shape
            if (opt.use_winograd63_convolution && num_input <= 24 && num_output <= 24)
            if ((opt.use_winograd63_convolution) && (num_input <= 32 && num_output <= 32))
                conv3x3s1_winograd63_transform_kernel(weight_data, weight_winograd63_data, num_input, num_output, opt);
            else if (opt.use_winograd43_convolution)
                conv3x3s1_winograd43_transform_kernel(weight_data, weight_winograd43_data, num_input, num_output, opt);
@@ -242,20 +408,6 @@ int Convolution_x86::create_pipeline(const Option& opt)
        }
        else
        {
            //  winograd selection strategy
            //
            //   |   |   |   |   |                    c/outc
            //   |   |   |   |   |        f63          ^
            //   |   |   |   |   |   +----------------+128
            //   |   |   |   |   |f63|    f43
            //   |f23|f43|f63|f43|   +---+            +64
            //   |   |   |   |   |  f63  |   f43
            //   |   |   |   |   |       +---+        +32
            //   |   |   |   |   |    f63    |   f43
            //   +---+---+---+---+---+---+---+--------+16
            //   0   14  19  21  31  96  132 192     --> wh
            //

            int w;
            int h;
            if (top_shapes.empty() || top_shapes[0].w == 0 || top_shapes[0].h == 0)
@@ -283,18 +435,9 @@ int Convolution_x86::create_pipeline(const Option& opt)
                h = top_shapes[0].h + 2;
            }

            const int minwh = std::min(w, h);

            bool prefer_winograd63 = minwh == 19 || minwh == 20
                                     || (minwh > 30 && num_input >= 128)
                                     || (minwh > 30 && num_input >= 64 && num_input < 128 && num_output >= 128)
                                     || (minwh > 30 && num_input >= 64 && num_input < 128 && num_output < 128 && minwh < 96)
                                     || (minwh > 30 && num_input >= 16 && num_input < 64 && num_output >= 64)
                                     || (minwh > 30 && num_input >= 32 && num_input < 64 && num_output < 64 && minwh < 132)
                                     || (minwh > 30 && num_input >= 16 && num_input < 32 && num_output < 64 && minwh < 192);

            bool prefer_winograd43 = (minwh > 14 && !prefer_winograd63);
            bool prefer_winograd23 = (!prefer_winograd43 && !prefer_winograd63);
            bool prefer_winograd63 = test_prefer_winograd63(num_input, num_output, w, h);
            bool prefer_winograd23 = test_prefer_winograd23(num_input, num_output, w, h);
            bool prefer_winograd43 = !prefer_winograd63 && !prefer_winograd23;

            if (prefer_winograd23 && !opt.use_winograd23_convolution)
            {
@@ -557,36 +700,13 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option

    const int num_input = channels * elempack;

    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && num_input >= 16 && num_output >= 16;
    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input > 8 || num_output > 8);

    if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        //  winograd selection strategy
        //
        //   |   |   |   |   |                    c/outc
        //   |   |   |   |   |        f63          ^
        //   |   |   |   |   |   +----------------+128
        //   |   |   |   |   |f63|    f43
        //   |f23|f43|f63|f43|   +---+            +64
        //   |   |   |   |   |  f63  |   f43
        //   |   |   |   |   |       +---+        +32
        //   |   |   |   |   |    f63    |   f43
        //   +---+---+---+---+---+---+---+--------+16
        //   0   14  19  21  31  96  132 192     --> wh
        //

        const int minwh = std::min(w, h);

        bool prefer_winograd63 = minwh == 19 || minwh == 20
                                 || (minwh > 30 && num_input >= 128)
                                 || (minwh > 30 && num_input >= 64 && num_input < 128 && num_output >= 128)
                                 || (minwh > 30 && num_input >= 64 && num_input < 128 && num_output < 128 && minwh < 96)
                                 || (minwh > 30 && num_input >= 16 && num_input < 64 && num_output >= 64)
                                 || (minwh > 30 && num_input >= 32 && num_input < 64 && num_output < 64 && minwh < 132)
                                 || (minwh > 30 && num_input >= 16 && num_input < 32 && num_output < 64 && minwh < 192);

        bool prefer_winograd43 = (minwh > 14 && !prefer_winograd63);
        bool prefer_winograd23 = (!prefer_winograd43 && !prefer_winograd63);
        bool prefer_winograd63 = test_prefer_winograd63(num_input, num_output, w, h);
        bool prefer_winograd23 = test_prefer_winograd23(num_input, num_output, w, h);
        bool prefer_winograd43 = !prefer_winograd63 && !prefer_winograd23;

        if (prefer_winograd23 && (!opt.use_winograd23_convolution || weight_winograd23_data.empty()))
        {
@@ -616,17 +736,25 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
            }
        }

        int _nT = nT ? nT : opt.num_threads;
        if (nT != 0 && opt.num_threads != nT)
        {
            // force num_threads the same as in create_pipeline
            // so we could use pre-packed A/B from the same tile config
            NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT);
        }

        if (prefer_winograd23)
        {
            conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt);
            conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt);
        }
        else if (prefer_winograd43)
        {
            conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt);
            conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt);
        }
        else if (prefer_winograd63)
        {
            conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, opt);
            conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt);
        }
        else
        {
--- a/src/layer/x86/convolution_x86.h
+++ b/src/layer/x86/convolution_x86.h
@@ -41,6 +41,7 @@ protected:
 public:
    Layer* activation;

    int nT;
    Mat weight_data_tm;
    Mat weight_sgemm_data;
    Mat weight_winograd23_data;
--- a/src/layer/x86/gemm_x86.cpp
+++ b/src/layer/x86/gemm_x86.cpp
@@ -5873,6 +5873,7 @@ static int gemm_x86(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int

    int nn_M = (M + TILE_M - 1) / TILE_M;
    int nn_N = (N + TILE_N - 1) / TILE_N;
    int nn_K = (K + TILE_K - 1) / TILE_K;

    Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 4u, opt.blob_allocator);
    Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.blob_allocator);
@@ -5881,27 +5882,30 @@ static int gemm_x86(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int
    if (K > TILE_K)
        tmpX.create(TILE_N, TILE_M, nT, 4u, opt.blob_allocator);

    const int nn_NK = nn_N * nn_K;

    // pack B
    #pragma omp parallel for num_threads(nT)
    for (int ppj = 0; ppj < nn_N; ppj++)
    for (int ppjk = 0; ppjk < nn_NK; ppjk++)
    {
        const int ppj = ppjk / nn_K;
        const int ppk = ppjk % nn_K;

        const int j = ppj * TILE_N;
        const int k = ppk * TILE_K;

        for (int k = 0; k < K; k += TILE_K)
        {
            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);
        const int max_jj = std::min((N - j), TILE_N);
        const int max_kk = std::min((K - k), TILE_K);

            Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);
        Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

            if (transB)
            {
                pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
            }
            else
            {
                transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
            }
        if (transB)
        {
            pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
        }
        else
        {
            transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
        }
    }

@@ -5965,6 +5969,7 @@ static int gemm_AT_x86(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob,

    int nn_M = (M + TILE_M - 1) / TILE_M;
    int nn_N = (N + TILE_N - 1) / TILE_N;
    int nn_K = (K + TILE_K - 1) / TILE_K;

    Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.blob_allocator);

@@ -5972,27 +5977,30 @@ static int gemm_AT_x86(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob,
    if (K > TILE_K)
        tmpX.create(TILE_N, TILE_M, nT, 4u, opt.blob_allocator);

    const int nn_NK = nn_N * nn_K;

    // pack B
    #pragma omp parallel for num_threads(nT)
    for (int ppj = 0; ppj < nn_N; ppj++)
    for (int ppjk = 0; ppjk < nn_NK; ppjk++)
    {
        const int ppj = ppjk / nn_K;
        const int ppk = ppjk % nn_K;

        const int j = ppj * TILE_N;
        const int k = ppk * TILE_K;

        for (int k = 0; k < K; k += TILE_K)
        {
            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);
        const int max_jj = std::min((N - j), TILE_N);
        const int max_kk = std::min((K - k), TILE_K);

            Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);
        Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

            if (transB)
            {
                pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
            }
            else
            {
                transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
            }
        if (transB)
        {
            pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
        }
        else
        {
            transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
        }
    }

@@ -6203,31 +6211,35 @@ int Gemm_x86::create_pipeline(const Option& opt)
        get_optimal_tile_mnk(0, N, K, TILE_M, TILE_N, TILE_K, opt.num_threads);

        const int nn_N = (N + TILE_N - 1) / TILE_N;
        const int nn_K = (K + TILE_K - 1) / TILE_K;

        BT_data.create(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.blob_allocator);
        if (BT_data.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ppj = 0; ppj < nn_N; ppj++)
        const int nn_NK = nn_N * nn_K;

        #pragma omp parallel for num_threads(nT)
        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_jj = std::min((N - j), TILE_N);
                const int max_kk = std::min((K - k), TILE_K);
            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

                Mat BT_tile = BT_data.channel(j / TILE_N).row_range(k / TILE_K, 1);
            Mat BT_tile = BT_data.channel(j / TILE_N).row_range(k / TILE_K, 1);

                if (transB)
                {
                    pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk);
                }
                else
                {
                    transpose_pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk);
                }
            if (transB)
            {
                pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk);
            }
            else
            {
                transpose_pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk);
            }
        }