fix too many microtask error in old libomp runtime (#4002)

4 years ago · f1ea792b26
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -245,6 +245,10 @@ static int convolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const Mat
                float* outptr = top_blob.channel(g * outch_g + p);
                const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;

                // shadowed variable for less openmp task args
                const int outw = top_blob.w;
                const int outh = top_blob.h;

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
--- a/src/layer/convolutiondepthwise3d.cpp
+++ b/src/layer/convolutiondepthwise3d.cpp
@@ -181,6 +181,11 @@ int ConvolutionDepthWise3D::forward(const Mat& bottom_blob, Mat& top_blob, const
                float* outptr = top_blob.channel(g * num_output_g + p);
                const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;

                // shadowed variable for less openmp task args
                const int outw = top_blob.w;
                const int outh = top_blob.h;
                const int outd = top_blob.d;

                for (int z = 0; z < outd; z++)
                {
                    for (int i = 0; i < outh; i++)
--- a/src/layer/deconvolution.cpp
+++ b/src/layer/deconvolution.cpp
@@ -67,16 +67,9 @@ int Deconvolution::load_model(const ModelBin& mb)

 static int deconvolution(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int bias_term = bias_data.empty() ? 0 : 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
@@ -103,10 +96,17 @@ static int deconvolution(const Mat& bottom_blob, Mat& top_blob, const Mat& weigh
    {
        Mat out = top_blob.channel(p);

        const float bias = bias_term ? bias_data[p] : 0.f;
        const float bias = bias_data.empty() ? 0.f : bias_data[p];

        out.fill(bias);

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int inch = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < h; i++)
        {
            for (int j = 0; j < w; j++)
--- a/src/layer/deconvolution3d.cpp
+++ b/src/layer/deconvolution3d.cpp
@@ -74,18 +74,10 @@ int Deconvolution3D::load_model(const ModelBin& mb)

 static int deconvolution3d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int kernel_d, int stride_w, int stride_h, int stride_d, int dilation_w, int dilation_h, int dilation_d, int activation_type, const Mat& activation_params, const Option& opt)
 {
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outd = top_blob.d;
    const int outch = top_blob.c;

    const int bias_term = bias_data.empty() ? 0 : 1;

    const int maxk = kernel_w * kernel_h * kernel_d;

    // kernel offsets
@@ -117,10 +109,19 @@ static int deconvolution3d(const Mat& bottom_blob, Mat& top_blob, const Mat& wei
    {
        Mat out = top_blob.channel(p);

        const float bias = bias_term ? bias_data[p] : 0.f;
        const float bias = bias_data.empty() ? 0.f : bias_data[p];

        out.fill(bias);

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int d = bottom_blob.d;
        const int inch = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;
        const int outd = top_blob.d;

        for (int z = 0; z < d; z++)
        {
            for (int i = 0; i < h; i++)
--- a/src/layer/deconvolutiondepthwise.cpp
+++ b/src/layer/deconvolutiondepthwise.cpp
@@ -68,16 +68,11 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb)

 static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int group, int activation_type, const Mat& activation_params, const Option& opt)
 {
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int bias_term = bias_data.empty() ? 0 : 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
@@ -109,10 +104,16 @@ static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const M
            const float* kptr = (const float*)weight_data + maxk * g;
            Mat out = top_blob.channel(g);

            const float bias = bias_term ? bias_data[g] : 0.f;
            const float bias = bias_data.empty() ? 0.f : bias_data[g];

            out.fill(bias);

            // shadowed variable for less openmp task args
            const int w = bottom_blob.w;
            const int h = bottom_blob.h;
            const int outw = top_blob.w;
            const int outh = top_blob.h;

            for (int i = 0; i < h; i++)
            {
                for (int j = 0; j < w; j++)
@@ -157,10 +158,17 @@ static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const M
                Mat out = top_blob.channel(g * outch_g + p);

                const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;
                const float bias = bias_term ? bias_data[g * outch_g + p] : 0.f;

                const float bias = bias_data.empty() ? 0.f : bias_data[g * outch_g + p];

                out.fill(bias);

                // shadowed variable for less openmp task args
                const int w = bottom_blob.w;
                const int h = bottom_blob.h;
                const int outw = top_blob.w;
                const int outh = top_blob.h;

                for (int i = 0; i < h; i++)
                {
                    for (int j = 0; j < w; j++)
--- a/src/layer/deconvolutiondepthwise3d.cpp
+++ b/src/layer/deconvolutiondepthwise3d.cpp
@@ -75,18 +75,12 @@ int DeconvolutionDepthWise3D::load_model(const ModelBin& mb)

 static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int kernel_d, int stride_w, int stride_h, int stride_d, int dilation_w, int dilation_h, int dilation_d, int group, int activation_type, const Mat& activation_params, const Option& opt)
 {
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outd = top_blob.d;
    const int outch = top_blob.c;

    const int bias_term = bias_data.empty() ? 0 : 1;

    const int maxk = kernel_w * kernel_h * kernel_d;

    // kernel offsets
@@ -123,10 +117,18 @@ static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const
            const float* kptr = (const float*)weight_data + maxk * g;
            Mat out = top_blob.channel(g);

            const float bias = bias_term ? bias_data[g] : 0.f;
            const float bias = bias_data.empty() ? 0.f : bias_data[g];

            out.fill(bias);

            // shadowed variable for less openmp task args
            const int w = bottom_blob.w;
            const int h = bottom_blob.h;
            const int d = bottom_blob.d;
            const int outw = top_blob.w;
            const int outh = top_blob.h;
            const int outd = top_blob.d;

            for (int z = 0; z < d; z++)
            {
                for (int i = 0; i < h; i++)
@@ -174,10 +176,19 @@ static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const
                Mat out = top_blob.channel(g * outch_g + p);

                const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;
                const float bias = bias_term ? bias_data[g * outch_g + p] : 0.f;

                const float bias = bias_data.empty() ? 0.f : bias_data[g * outch_g + p];

                out.fill(bias);

                // shadowed variable for less openmp task args
                const int w = bottom_blob.w;
                const int h = bottom_blob.h;
                const int d = bottom_blob.d;
                const int outw = top_blob.w;
                const int outh = top_blob.h;
                const int outd = top_blob.d;

                for (int z = 0; z < d; z++)
                {
                    for (int i = 0; i < h; i++)
--- a/src/layer/x86/deconvolution_pack16.h
+++ b/src/layer/x86/deconvolution_pack16.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack16_avx512(const Mat& bottom_blob, Mat& top_blob, c
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack16to1.h
+++ b/src/layer/x86/deconvolution_pack16to1.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack16to1_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack16to1_avx512(const Mat& bottom_blob, Mat& top_blob
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack16to4.h
+++ b/src/layer/x86/deconvolution_pack16to4.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack16to4_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack16to4_avx512(const Mat& bottom_blob, Mat& top_blob
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack16to8.h
+++ b/src/layer/x86/deconvolution_pack16to8.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack16to8_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack16to8_avx512(const Mat& bottom_blob, Mat& top_blob
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack1to16.h
+++ b/src/layer/x86/deconvolution_pack1to16.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack1to16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack1to16_avx512(const Mat& bottom_blob, Mat& top_blob
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack1to4.h
+++ b/src/layer/x86/deconvolution_pack1to4.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack1to4_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack1to4_sse(const Mat& bottom_blob, Mat& top_blob, co
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack1to8.h
+++ b/src/layer/x86/deconvolution_pack1to8.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack1to8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack1to8_avx(const Mat& bottom_blob, Mat& top_blob, co
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack4.h
+++ b/src/layer/x86/deconvolution_pack4.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack4_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack4_sse(const Mat& bottom_blob, Mat& top_blob, const
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack4to1.h
+++ b/src/layer/x86/deconvolution_pack4to1.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack4to1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack4to1_sse(const Mat& bottom_blob, Mat& top_blob, co
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack4to16.h
+++ b/src/layer/x86/deconvolution_pack4to16.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack4to16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack4to16_avx512(const Mat& bottom_blob, Mat& top_blob
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack4to8.h
+++ b/src/layer/x86/deconvolution_pack4to8.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack4to8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack4to8_avx(const Mat& bottom_blob, Mat& top_blob, co
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack8.h
+++ b/src/layer/x86/deconvolution_pack8.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack8to1.h
+++ b/src/layer/x86/deconvolution_pack8to1.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack8to1_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack8to1_avx(const Mat& bottom_blob, Mat& top_blob, co
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack8to16.h
+++ b/src/layer/x86/deconvolution_pack8to16.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack8to16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack8to16_avx512(const Mat& bottom_blob, Mat& top_blob
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_pack8to4.h
+++ b/src/layer/x86/deconvolution_pack8to4.h
@@ -14,19 +14,11 @@

 static void deconvolution_pack8to4_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack8to4_avx(const Mat& bottom_blob, Mat& top_blob, co
    {
        float* outptr = top_blob.channel(p);

        const int maxk = kernel_w * kernel_h;

        // shadowed variable for less openmp task args
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int channels = bottom_blob.c;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
--- a/src/layer/x86/deconvolution_x86.cpp
+++ b/src/layer/x86/deconvolution_x86.cpp
@@ -311,6 +311,13 @@ int Deconvolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
            {
                float* outptr = top_blob_bordered.channel(p);

                // shadowed variable for less openmp task args
                const int w = bottom_blob.w;
                const int h = bottom_blob.h;
                const int channels = bottom_blob.c;
                const int outw = top_blob_bordered.w;
                const int outh = top_blob_bordered.h;

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)