From f1ea792b264da8d3e0fd7ac02cf45d3053c9020d Mon Sep 17 00:00:00 2001
From: nihui <nihuini@tencent.com>
Date: Wed, 13 Jul 2022 10:08:30 +0800
Subject: [PATCH] fix too many microtask args error in old libomp runtime (#4002)

---
 src/layer/convolutiondepthwise.cpp      |  4 ++++
 src/layer/convolutiondepthwise3d.cpp    |  5 +++++
 src/layer/deconvolution.cpp             | 16 +++++++--------
 src/layer/deconvolution3d.cpp           | 19 ++++++++---------
 src/layer/deconvolutiondepthwise.cpp    | 22 +++++++++++++-------
 src/layer/deconvolutiondepthwise3d.cpp  | 27 +++++++++++++++++--------
 src/layer/x86/deconvolution_pack16.h    | 17 ++++++++--------
 src/layer/x86/deconvolution_pack16to1.h | 17 ++++++++--------
 src/layer/x86/deconvolution_pack16to4.h | 17 ++++++++--------
 src/layer/x86/deconvolution_pack16to8.h | 17 ++++++++--------
 src/layer/x86/deconvolution_pack1to16.h | 17 ++++++++--------
 src/layer/x86/deconvolution_pack1to4.h  | 17 ++++++++--------
 src/layer/x86/deconvolution_pack1to8.h  | 17 ++++++++--------
 src/layer/x86/deconvolution_pack4.h     | 17 ++++++++--------
 src/layer/x86/deconvolution_pack4to1.h  | 17 ++++++++--------
 src/layer/x86/deconvolution_pack4to16.h | 17 ++++++++--------
 src/layer/x86/deconvolution_pack4to8.h  | 17 ++++++++--------
 src/layer/x86/deconvolution_pack8.h     | 17 ++++++++--------
 src/layer/x86/deconvolution_pack8to1.h  | 17 ++++++++--------
 src/layer/x86/deconvolution_pack8to16.h | 17 ++++++++--------
 src/layer/x86/deconvolution_pack8to4.h  | 17 ++++++++--------
 src/layer/x86/deconvolution_x86.cpp     |  7 +++++++
 22 files changed, 203 insertions(+), 152 deletions(-)

diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp
index 955fc0676..e820a192c 100644
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -245,6 +245,10 @@ static int convolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const Mat
                 float* outptr = top_blob.channel(g * outch_g + p);
                 const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;
 
+                // shadowed variable for less openmp task args
+                const int outw = top_blob.w;
+                const int outh = top_blob.h;
+
                 for (int i = 0; i < outh; i++)
                 {
                     for (int j = 0; j < outw; j++)
diff --git a/src/layer/convolutiondepthwise3d.cpp b/src/layer/convolutiondepthwise3d.cpp
index e90baec7f..ba4e46c7a 100644
--- a/src/layer/convolutiondepthwise3d.cpp
+++ b/src/layer/convolutiondepthwise3d.cpp
@@ -181,6 +181,11 @@ int ConvolutionDepthWise3D::forward(const Mat& bottom_blob, Mat& top_blob, const
                 float* outptr = top_blob.channel(g * num_output_g + p);
                 const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;
 
+                // shadowed variable for less openmp task args
+                const int outw = top_blob.w;
+                const int outh = top_blob.h;
+                const int outd = top_blob.d;
+
                 for (int z = 0; z < outd; z++)
                 {
                     for (int i = 0; i < outh; i++)
diff --git a/src/layer/deconvolution.cpp b/src/layer/deconvolution.cpp
index 88a79f1ab..411395f96 100644
--- a/src/layer/deconvolution.cpp
+++ b/src/layer/deconvolution.cpp
@@ -67,16 +67,9 @@ int Deconvolution::load_model(const ModelBin& mb)
 
 static int deconvolution(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    const int w = bottom_blob.w;
-    const int h = bottom_blob.h;
-    const int inch = bottom_blob.c;
-
     const int outw = top_blob.w;
-    const int outh = top_blob.h;
     const int outch = top_blob.c;
 
-    const int bias_term = bias_data.empty() ? 0 : 1;
-
     const int maxk = kernel_w * kernel_h;
 
     // kernel offsets
@@ -103,10 +96,17 @@ static int deconvolution(const Mat& bottom_blob, Mat& top_blob, const Mat& weigh
     {
         Mat out = top_blob.channel(p);
 
-        const float bias = bias_term ? bias_data[p] : 0.f;
+        const float bias = bias_data.empty() ? 0.f : bias_data[p];
 
         out.fill(bias);
 
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int inch = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < h; i++)
         {
             for (int j = 0; j < w; j++)
diff --git a/src/layer/deconvolution3d.cpp b/src/layer/deconvolution3d.cpp
index 9850d4255..45bc3690a 100644
--- a/src/layer/deconvolution3d.cpp
+++ b/src/layer/deconvolution3d.cpp
@@ -74,18 +74,10 @@ int Deconvolution3D::load_model(const ModelBin& mb)
 
 static int deconvolution3d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int kernel_d, int stride_w, int stride_h, int stride_d, int dilation_w, int dilation_h, int dilation_d, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    const int w = bottom_blob.w;
-    const int h = bottom_blob.h;
-    const int d = bottom_blob.d;
-    const int inch = bottom_blob.c;
-
     const int outw = top_blob.w;
     const int outh = top_blob.h;
-    const int outd = top_blob.d;
     const int outch = top_blob.c;
 
-    const int bias_term = bias_data.empty() ? 0 : 1;
-
     const int maxk = kernel_w * kernel_h * kernel_d;
 
     // kernel offsets
@@ -117,10 +109,19 @@ static int deconvolution3d(const Mat& bottom_blob, Mat& top_blob, const Mat& wei
     {
         Mat out = top_blob.channel(p);
 
-        const float bias = bias_term ? bias_data[p] : 0.f;
+        const float bias = bias_data.empty() ? 0.f : bias_data[p];
 
         out.fill(bias);
 
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int d = bottom_blob.d;
+        const int inch = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+        const int outd = top_blob.d;
+
         for (int z = 0; z < d; z++)
         {
             for (int i = 0; i < h; i++)
diff --git a/src/layer/deconvolutiondepthwise.cpp b/src/layer/deconvolutiondepthwise.cpp
index 12f9dca75..cd0ef36ea 100644
--- a/src/layer/deconvolutiondepthwise.cpp
+++ b/src/layer/deconvolutiondepthwise.cpp
@@ -68,16 +68,11 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb)
 
 static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int group, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    const int w = bottom_blob.w;
-    const int h = bottom_blob.h;
     const int inch = bottom_blob.c;
 
     const int outw = top_blob.w;
-    const int outh = top_blob.h;
     const int outch = top_blob.c;
 
-    const int bias_term = bias_data.empty() ? 0 : 1;
-
     const int maxk = kernel_w * kernel_h;
 
     // kernel offsets
@@ -109,10 +104,16 @@ static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const M
             const float* kptr = (const float*)weight_data + maxk * g;
             Mat out = top_blob.channel(g);
 
-            const float bias = bias_term ? bias_data[g] : 0.f;
+            const float bias = bias_data.empty() ? 0.f : bias_data[g];
 
             out.fill(bias);
 
+            // shadowed variable for less openmp task args
+            const int w = bottom_blob.w;
+            const int h = bottom_blob.h;
+            const int outw = top_blob.w;
+            const int outh = top_blob.h;
+
             for (int i = 0; i < h; i++)
             {
                 for (int j = 0; j < w; j++)
@@ -157,10 +158,17 @@ static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const M
                 Mat out = top_blob.channel(g * outch_g + p);
 
                 const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;
-                const float bias = bias_term ? bias_data[g * outch_g + p] : 0.f;
+
+                const float bias = bias_data.empty() ? 0.f : bias_data[g * outch_g + p];
 
                 out.fill(bias);
 
+                // shadowed variable for less openmp task args
+                const int w = bottom_blob.w;
+                const int h = bottom_blob.h;
+                const int outw = top_blob.w;
+                const int outh = top_blob.h;
+
                 for (int i = 0; i < h; i++)
                 {
                     for (int j = 0; j < w; j++)
diff --git a/src/layer/deconvolutiondepthwise3d.cpp b/src/layer/deconvolutiondepthwise3d.cpp
index 9ddee4142..1f96dd013 100644
--- a/src/layer/deconvolutiondepthwise3d.cpp
+++ b/src/layer/deconvolutiondepthwise3d.cpp
@@ -75,18 +75,12 @@ int DeconvolutionDepthWise3D::load_model(const ModelBin& mb)
 
 static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int kernel_d, int stride_w, int stride_h, int stride_d, int dilation_w, int dilation_h, int dilation_d, int group, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    const int w = bottom_blob.w;
-    const int h = bottom_blob.h;
-    const int d = bottom_blob.d;
     const int inch = bottom_blob.c;
 
     const int outw = top_blob.w;
     const int outh = top_blob.h;
-    const int outd = top_blob.d;
     const int outch = top_blob.c;
 
-    const int bias_term = bias_data.empty() ? 0 : 1;
-
     const int maxk = kernel_w * kernel_h * kernel_d;
 
     // kernel offsets
@@ -123,10 +117,18 @@ static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const
             const float* kptr = (const float*)weight_data + maxk * g;
             Mat out = top_blob.channel(g);
 
-            const float bias = bias_term ? bias_data[g] : 0.f;
+            const float bias = bias_data.empty() ? 0.f : bias_data[g];
 
             out.fill(bias);
 
+            // shadowed variable for less openmp task args
+            const int w = bottom_blob.w;
+            const int h = bottom_blob.h;
+            const int d = bottom_blob.d;
+            const int outw = top_blob.w;
+            const int outh = top_blob.h;
+            const int outd = top_blob.d;
+
             for (int z = 0; z < d; z++)
             {
                 for (int i = 0; i < h; i++)
@@ -174,10 +176,19 @@ static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const
                 Mat out = top_blob.channel(g * outch_g + p);
 
                 const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;
-                const float bias = bias_term ? bias_data[g * outch_g + p] : 0.f;
+
+                const float bias = bias_data.empty() ? 0.f : bias_data[g * outch_g + p];
 
                 out.fill(bias);
 
+                // shadowed variable for less openmp task args
+                const int w = bottom_blob.w;
+                const int h = bottom_blob.h;
+                const int d = bottom_blob.d;
+                const int outw = top_blob.w;
+                const int outh = top_blob.h;
+                const int outd = top_blob.d;
+
                 for (int z = 0; z < d; z++)
                 {
                     for (int i = 0; i < h; i++)
diff --git a/src/layer/x86/deconvolution_pack16.h b/src/layer/x86/deconvolution_pack16.h
index 177f0f409..8f21a3938 100644
--- a/src/layer/x86/deconvolution_pack16.h
+++ b/src/layer/x86/deconvolution_pack16.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack16_avx512(const Mat& bottom_blob, Mat& top_blob, c
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack16to1.h b/src/layer/x86/deconvolution_pack16to1.h
index 91ab274e9..65a1cfae0 100644
--- a/src/layer/x86/deconvolution_pack16to1.h
+++ b/src/layer/x86/deconvolution_pack16to1.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack16to1_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack16to1_avx512(const Mat& bottom_blob, Mat& top_blob
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack16to4.h b/src/layer/x86/deconvolution_pack16to4.h
index e96d04549..ce4f7fd37 100644
--- a/src/layer/x86/deconvolution_pack16to4.h
+++ b/src/layer/x86/deconvolution_pack16to4.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack16to4_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack16to4_avx512(const Mat& bottom_blob, Mat& top_blob
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack16to8.h b/src/layer/x86/deconvolution_pack16to8.h
index a7aad0e28..0001f7c30 100644
--- a/src/layer/x86/deconvolution_pack16to8.h
+++ b/src/layer/x86/deconvolution_pack16to8.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack16to8_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack16to8_avx512(const Mat& bottom_blob, Mat& top_blob
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack1to16.h b/src/layer/x86/deconvolution_pack1to16.h
index ef9865a69..c357e1f0a 100644
--- a/src/layer/x86/deconvolution_pack1to16.h
+++ b/src/layer/x86/deconvolution_pack1to16.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack1to16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack1to16_avx512(const Mat& bottom_blob, Mat& top_blob
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack1to4.h b/src/layer/x86/deconvolution_pack1to4.h
index 4bb9379e6..e749f1f91 100644
--- a/src/layer/x86/deconvolution_pack1to4.h
+++ b/src/layer/x86/deconvolution_pack1to4.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack1to4_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack1to4_sse(const Mat& bottom_blob, Mat& top_blob, co
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack1to8.h b/src/layer/x86/deconvolution_pack1to8.h
index 960018693..31b7a6729 100644
--- a/src/layer/x86/deconvolution_pack1to8.h
+++ b/src/layer/x86/deconvolution_pack1to8.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack1to8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack1to8_avx(const Mat& bottom_blob, Mat& top_blob, co
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack4.h b/src/layer/x86/deconvolution_pack4.h
index 6eccca30f..96955e209 100644
--- a/src/layer/x86/deconvolution_pack4.h
+++ b/src/layer/x86/deconvolution_pack4.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack4_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack4_sse(const Mat& bottom_blob, Mat& top_blob, const
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack4to1.h b/src/layer/x86/deconvolution_pack4to1.h
index 2b1a7675c..27f890437 100644
--- a/src/layer/x86/deconvolution_pack4to1.h
+++ b/src/layer/x86/deconvolution_pack4to1.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack4to1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack4to1_sse(const Mat& bottom_blob, Mat& top_blob, co
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack4to16.h b/src/layer/x86/deconvolution_pack4to16.h
index c1930b66a..4510c28a4 100644
--- a/src/layer/x86/deconvolution_pack4to16.h
+++ b/src/layer/x86/deconvolution_pack4to16.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack4to16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack4to16_avx512(const Mat& bottom_blob, Mat& top_blob
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack4to8.h b/src/layer/x86/deconvolution_pack4to8.h
index 82ca0e6ee..1ca095275 100644
--- a/src/layer/x86/deconvolution_pack4to8.h
+++ b/src/layer/x86/deconvolution_pack4to8.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack4to8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack4to8_avx(const Mat& bottom_blob, Mat& top_blob, co
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack8.h b/src/layer/x86/deconvolution_pack8.h
index e74c6660e..c0ec798ef 100644
--- a/src/layer/x86/deconvolution_pack8.h
+++ b/src/layer/x86/deconvolution_pack8.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack8to1.h b/src/layer/x86/deconvolution_pack8to1.h
index 4c639e0f8..4d40bb425 100644
--- a/src/layer/x86/deconvolution_pack8to1.h
+++ b/src/layer/x86/deconvolution_pack8to1.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack8to1_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack8to1_avx(const Mat& bottom_blob, Mat& top_blob, co
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack8to16.h b/src/layer/x86/deconvolution_pack8to16.h
index 88693dd23..86f0d726b 100644
--- a/src/layer/x86/deconvolution_pack8to16.h
+++ b/src/layer/x86/deconvolution_pack8to16.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack8to16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack8to16_avx512(const Mat& bottom_blob, Mat& top_blob
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_pack8to4.h b/src/layer/x86/deconvolution_pack8to4.h
index 87cacfd18..fea7a87b6 100644
--- a/src/layer/x86/deconvolution_pack8to4.h
+++ b/src/layer/x86/deconvolution_pack8to4.h
@@ -14,19 +14,11 @@
 
 static void deconvolution_pack8to4_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
 
-    const int maxk = kernel_w * kernel_h;
-
     const float* bias_data_ptr = bias_data;
 
     // num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack8to4_avx(const Mat& bottom_blob, Mat& top_blob, co
     {
         float* outptr = top_blob.channel(p);
 
+        const int maxk = kernel_w * kernel_h;
+
+        // shadowed variable for less openmp task args
+        const int w = bottom_blob.w;
+        const int h = bottom_blob.h;
+        const int channels = bottom_blob.c;
+        const int outw = top_blob.w;
+        const int outh = top_blob.h;
+
         for (int i = 0; i < outh; i++)
         {
             for (int j = 0; j < outw; j++)
diff --git a/src/layer/x86/deconvolution_x86.cpp b/src/layer/x86/deconvolution_x86.cpp
index b253fa2e3..6d7c3aaaa 100644
--- a/src/layer/x86/deconvolution_x86.cpp
+++ b/src/layer/x86/deconvolution_x86.cpp
@@ -311,6 +311,13 @@ int Deconvolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
             {
                 float* outptr = top_blob_bordered.channel(p);
 
+                // shadowed variable for less openmp task args
+                const int w = bottom_blob.w;
+                const int h = bottom_blob.h;
+                const int channels = bottom_blob.c;
+                const int outw = top_blob_bordered.w;
+                const int outh = top_blob_bordered.h;
+
                 for (int i = 0; i < outh; i++)
                 {
                     for (int j = 0; j < outw; j++)