From f1ea792b264da8d3e0fd7ac02cf45d3053c9020d Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 13 Jul 2022 10:08:30 +0800 Subject: [PATCH] fix too many microtask error in old libomp runtime (#4002) --- src/layer/convolutiondepthwise.cpp | 4 ++++ src/layer/convolutiondepthwise3d.cpp | 5 +++++ src/layer/deconvolution.cpp | 16 +++++++-------- src/layer/deconvolution3d.cpp | 19 ++++++++--------- src/layer/deconvolutiondepthwise.cpp | 22 +++++++++++++------- src/layer/deconvolutiondepthwise3d.cpp | 27 +++++++++++++++++-------- src/layer/x86/deconvolution_pack16.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack16to1.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack16to4.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack16to8.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack1to16.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack1to4.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack1to8.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack4.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack4to1.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack4to16.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack4to8.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack8.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack8to1.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack8to16.h | 17 ++++++++-------- src/layer/x86/deconvolution_pack8to4.h | 17 ++++++++-------- src/layer/x86/deconvolution_x86.cpp | 7 +++++++ 22 files changed, 203 insertions(+), 152 deletions(-) diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp index 955fc0676..e820a192c 100644 --- a/src/layer/convolutiondepthwise.cpp +++ b/src/layer/convolutiondepthwise.cpp @@ -245,6 +245,10 @@ static int convolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const Mat float* outptr = top_blob.channel(g * outch_g + p); const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g; + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/convolutiondepthwise3d.cpp b/src/layer/convolutiondepthwise3d.cpp index e90baec7f..ba4e46c7a 100644 --- a/src/layer/convolutiondepthwise3d.cpp +++ b/src/layer/convolutiondepthwise3d.cpp @@ -181,6 +181,11 @@ int ConvolutionDepthWise3D::forward(const Mat& bottom_blob, Mat& top_blob, const float* outptr = top_blob.channel(g * num_output_g + p); const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g; + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outd = top_blob.d; + for (int z = 0; z < outd; z++) { for (int i = 0; i < outh; i++) diff --git a/src/layer/deconvolution.cpp b/src/layer/deconvolution.cpp index 88a79f1ab..411395f96 100644 --- a/src/layer/deconvolution.cpp +++ b/src/layer/deconvolution.cpp @@ -67,16 +67,9 @@ int Deconvolution::load_model(const ModelBin& mb) static int deconvolution(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int activation_type, const Mat& activation_params, const Option& opt) { - const int w = bottom_blob.w; - const int h = bottom_blob.h; - const int inch = bottom_blob.c; - const int outw = top_blob.w; - const int outh = top_blob.h; const int outch = top_blob.c; - const int bias_term = bias_data.empty() ? 0 : 1; - const int maxk = kernel_w * kernel_h; // kernel offsets @@ -103,10 +96,17 @@ static int deconvolution(const Mat& bottom_blob, Mat& top_blob, const Mat& weigh { Mat out = top_blob.channel(p); - const float bias = bias_term ? bias_data[p] : 0.f; + const float bias = bias_data.empty() ? 0.f : bias_data[p]; out.fill(bias); + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) diff --git a/src/layer/deconvolution3d.cpp b/src/layer/deconvolution3d.cpp index 9850d4255..45bc3690a 100644 --- a/src/layer/deconvolution3d.cpp +++ b/src/layer/deconvolution3d.cpp @@ -74,18 +74,10 @@ int Deconvolution3D::load_model(const ModelBin& mb) static int deconvolution3d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int kernel_d, int stride_w, int stride_h, int stride_d, int dilation_w, int dilation_h, int dilation_d, int activation_type, const Mat& activation_params, const Option& opt) { - const int w = bottom_blob.w; - const int h = bottom_blob.h; - const int d = bottom_blob.d; - const int inch = bottom_blob.c; - const int outw = top_blob.w; const int outh = top_blob.h; - const int outd = top_blob.d; const int outch = top_blob.c; - const int bias_term = bias_data.empty() ? 0 : 1; - const int maxk = kernel_w * kernel_h * kernel_d; // kernel offsets @@ -117,10 +109,19 @@ static int deconvolution3d(const Mat& bottom_blob, Mat& top_blob, const Mat& wei { Mat out = top_blob.channel(p); - const float bias = bias_term ? bias_data[p] : 0.f; + const float bias = bias_data.empty() ? 0.f : bias_data[p]; out.fill(bias); + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int d = bottom_blob.d; + const int inch = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outd = top_blob.d; + for (int z = 0; z < d; z++) { for (int i = 0; i < h; i++) diff --git a/src/layer/deconvolutiondepthwise.cpp b/src/layer/deconvolutiondepthwise.cpp index 12f9dca75..cd0ef36ea 100644 --- a/src/layer/deconvolutiondepthwise.cpp +++ b/src/layer/deconvolutiondepthwise.cpp @@ -68,16 +68,11 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb) static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int group, int activation_type, const Mat& activation_params, const Option& opt) { - const int w = bottom_blob.w; - const int h = bottom_blob.h; const int inch = bottom_blob.c; const int outw = top_blob.w; - const int outh = top_blob.h; const int outch = top_blob.c; - const int bias_term = bias_data.empty() ? 0 : 1; - const int maxk = kernel_w * kernel_h; // kernel offsets @@ -109,10 +104,16 @@ static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const M const float* kptr = (const float*)weight_data + maxk * g; Mat out = top_blob.channel(g); - const float bias = bias_term ? bias_data[g] : 0.f; + const float bias = bias_data.empty() ? 0.f : bias_data[g]; out.fill(bias); + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) @@ -157,10 +158,17 @@ static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const M Mat out = top_blob.channel(g * outch_g + p); const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g; - const float bias = bias_term ? bias_data[g * outch_g + p] : 0.f; + + const float bias = bias_data.empty() ? 0.f : bias_data[g * outch_g + p]; out.fill(bias); + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) diff --git a/src/layer/deconvolutiondepthwise3d.cpp b/src/layer/deconvolutiondepthwise3d.cpp index 9ddee4142..1f96dd013 100644 --- a/src/layer/deconvolutiondepthwise3d.cpp +++ b/src/layer/deconvolutiondepthwise3d.cpp @@ -75,18 +75,12 @@ int DeconvolutionDepthWise3D::load_model(const ModelBin& mb) static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int kernel_d, int stride_w, int stride_h, int stride_d, int dilation_w, int dilation_h, int dilation_d, int group, int activation_type, const Mat& activation_params, const Option& opt) { - const int w = bottom_blob.w; - const int h = bottom_blob.h; - const int d = bottom_blob.d; const int inch = bottom_blob.c; const int outw = top_blob.w; const int outh = top_blob.h; - const int outd = top_blob.d; const int outch = top_blob.c; - const int bias_term = bias_data.empty() ? 0 : 1; - const int maxk = kernel_w * kernel_h * kernel_d; // kernel offsets @@ -123,10 +117,18 @@ static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const const float* kptr = (const float*)weight_data + maxk * g; Mat out = top_blob.channel(g); - const float bias = bias_term ? bias_data[g] : 0.f; + const float bias = bias_data.empty() ? 0.f : bias_data[g]; out.fill(bias); + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int d = bottom_blob.d; + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outd = top_blob.d; + for (int z = 0; z < d; z++) { for (int i = 0; i < h; i++) @@ -174,10 +176,19 @@ static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const Mat out = top_blob.channel(g * outch_g + p); const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g; - const float bias = bias_term ? bias_data[g * outch_g + p] : 0.f; + + const float bias = bias_data.empty() ? 0.f : bias_data[g * outch_g + p]; out.fill(bias); + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int d = bottom_blob.d; + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outd = top_blob.d; + for (int z = 0; z < d; z++) { for (int i = 0; i < h; i++) diff --git a/src/layer/x86/deconvolution_pack16.h b/src/layer/x86/deconvolution_pack16.h index 177f0f409..8f21a3938 100644 --- a/src/layer/x86/deconvolution_pack16.h +++ b/src/layer/x86/deconvolution_pack16.h @@ -14,19 +14,11 @@ static void deconvolution_pack16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack16_avx512(const Mat& bottom_blob, Mat& top_blob, c { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack16to1.h b/src/layer/x86/deconvolution_pack16to1.h index 91ab274e9..65a1cfae0 100644 --- a/src/layer/x86/deconvolution_pack16to1.h +++ b/src/layer/x86/deconvolution_pack16to1.h @@ -14,19 +14,11 @@ static void deconvolution_pack16to1_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack16to1_avx512(const Mat& bottom_blob, Mat& top_blob { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack16to4.h b/src/layer/x86/deconvolution_pack16to4.h index e96d04549..ce4f7fd37 100644 --- a/src/layer/x86/deconvolution_pack16to4.h +++ b/src/layer/x86/deconvolution_pack16to4.h @@ -14,19 +14,11 @@ static void deconvolution_pack16to4_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack16to4_avx512(const Mat& bottom_blob, Mat& top_blob { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack16to8.h b/src/layer/x86/deconvolution_pack16to8.h index a7aad0e28..0001f7c30 100644 --- a/src/layer/x86/deconvolution_pack16to8.h +++ b/src/layer/x86/deconvolution_pack16to8.h @@ -14,19 +14,11 @@ static void deconvolution_pack16to8_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack16to8_avx512(const Mat& bottom_blob, Mat& top_blob { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack1to16.h b/src/layer/x86/deconvolution_pack1to16.h index ef9865a69..c357e1f0a 100644 --- a/src/layer/x86/deconvolution_pack1to16.h +++ b/src/layer/x86/deconvolution_pack1to16.h @@ -14,19 +14,11 @@ static void deconvolution_pack1to16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack1to16_avx512(const Mat& bottom_blob, Mat& top_blob { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack1to4.h b/src/layer/x86/deconvolution_pack1to4.h index 4bb9379e6..e749f1f91 100644 --- a/src/layer/x86/deconvolution_pack1to4.h +++ b/src/layer/x86/deconvolution_pack1to4.h @@ -14,19 +14,11 @@ static void deconvolution_pack1to4_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack1to4_sse(const Mat& bottom_blob, Mat& top_blob, co { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack1to8.h b/src/layer/x86/deconvolution_pack1to8.h index 960018693..31b7a6729 100644 --- a/src/layer/x86/deconvolution_pack1to8.h +++ b/src/layer/x86/deconvolution_pack1to8.h @@ -14,19 +14,11 @@ static void deconvolution_pack1to8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack1to8_avx(const Mat& bottom_blob, Mat& top_blob, co { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack4.h b/src/layer/x86/deconvolution_pack4.h index 6eccca30f..96955e209 100644 --- a/src/layer/x86/deconvolution_pack4.h +++ b/src/layer/x86/deconvolution_pack4.h @@ -14,19 +14,11 @@ static void deconvolution_pack4_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack4_sse(const Mat& bottom_blob, Mat& top_blob, const { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack4to1.h b/src/layer/x86/deconvolution_pack4to1.h index 2b1a7675c..27f890437 100644 --- a/src/layer/x86/deconvolution_pack4to1.h +++ b/src/layer/x86/deconvolution_pack4to1.h @@ -14,19 +14,11 @@ static void deconvolution_pack4to1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack4to1_sse(const Mat& bottom_blob, Mat& top_blob, co { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack4to16.h b/src/layer/x86/deconvolution_pack4to16.h index c1930b66a..4510c28a4 100644 --- a/src/layer/x86/deconvolution_pack4to16.h +++ b/src/layer/x86/deconvolution_pack4to16.h @@ -14,19 +14,11 @@ static void deconvolution_pack4to16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack4to16_avx512(const Mat& bottom_blob, Mat& top_blob { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack4to8.h b/src/layer/x86/deconvolution_pack4to8.h index 82ca0e6ee..1ca095275 100644 --- a/src/layer/x86/deconvolution_pack4to8.h +++ b/src/layer/x86/deconvolution_pack4to8.h @@ -14,19 +14,11 @@ static void deconvolution_pack4to8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack4to8_avx(const Mat& bottom_blob, Mat& top_blob, co { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack8.h b/src/layer/x86/deconvolution_pack8.h index e74c6660e..c0ec798ef 100644 --- a/src/layer/x86/deconvolution_pack8.h +++ b/src/layer/x86/deconvolution_pack8.h @@ -14,19 +14,11 @@ static void deconvolution_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack8to1.h b/src/layer/x86/deconvolution_pack8to1.h index 4c639e0f8..4d40bb425 100644 --- a/src/layer/x86/deconvolution_pack8to1.h +++ b/src/layer/x86/deconvolution_pack8to1.h @@ -14,19 +14,11 @@ static void deconvolution_pack8to1_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack8to1_avx(const Mat& bottom_blob, Mat& top_blob, co { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack8to16.h b/src/layer/x86/deconvolution_pack8to16.h index 88693dd23..86f0d726b 100644 --- a/src/layer/x86/deconvolution_pack8to16.h +++ b/src/layer/x86/deconvolution_pack8to16.h @@ -14,19 +14,11 @@ static void deconvolution_pack8to16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack8to16_avx512(const Mat& bottom_blob, Mat& top_blob { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_pack8to4.h b/src/layer/x86/deconvolution_pack8to4.h index 87cacfd18..fea7a87b6 100644 --- a/src/layer/x86/deconvolution_pack8to4.h +++ b/src/layer/x86/deconvolution_pack8to4.h @@ -14,19 +14,11 @@ static void deconvolution_pack8to4_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int maxk = kernel_w * kernel_h; - const float* bias_data_ptr = bias_data; // num_output @@ -35,6 +27,15 @@ static void deconvolution_pack8to4_avx(const Mat& bottom_blob, Mat& top_blob, co { float* outptr = top_blob.channel(p); + const int maxk = kernel_w * kernel_h; + + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob.w; + const int outh = top_blob.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) diff --git a/src/layer/x86/deconvolution_x86.cpp b/src/layer/x86/deconvolution_x86.cpp index b253fa2e3..6d7c3aaaa 100644 --- a/src/layer/x86/deconvolution_x86.cpp +++ b/src/layer/x86/deconvolution_x86.cpp @@ -311,6 +311,13 @@ int Deconvolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Opti { float* outptr = top_blob_bordered.channel(p); + // shadowed variable for less openmp task args + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int outw = top_blob_bordered.w; + const int outh = top_blob_bordered.h; + for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++)