Browse Source

fix "too many args to microtask" error in old libomp runtime (#4002)

tags/20220721
nihui GitHub 3 years ago
parent
commit
f1ea792b26
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 203 additions and 152 deletions
  1. +4
    -0
      src/layer/convolutiondepthwise.cpp
  2. +5
    -0
      src/layer/convolutiondepthwise3d.cpp
  3. +8
    -8
      src/layer/deconvolution.cpp
  4. +10
    -9
      src/layer/deconvolution3d.cpp
  5. +15
    -7
      src/layer/deconvolutiondepthwise.cpp
  6. +19
    -8
      src/layer/deconvolutiondepthwise3d.cpp
  7. +9
    -8
      src/layer/x86/deconvolution_pack16.h
  8. +9
    -8
      src/layer/x86/deconvolution_pack16to1.h
  9. +9
    -8
      src/layer/x86/deconvolution_pack16to4.h
  10. +9
    -8
      src/layer/x86/deconvolution_pack16to8.h
  11. +9
    -8
      src/layer/x86/deconvolution_pack1to16.h
  12. +9
    -8
      src/layer/x86/deconvolution_pack1to4.h
  13. +9
    -8
      src/layer/x86/deconvolution_pack1to8.h
  14. +9
    -8
      src/layer/x86/deconvolution_pack4.h
  15. +9
    -8
      src/layer/x86/deconvolution_pack4to1.h
  16. +9
    -8
      src/layer/x86/deconvolution_pack4to16.h
  17. +9
    -8
      src/layer/x86/deconvolution_pack4to8.h
  18. +9
    -8
      src/layer/x86/deconvolution_pack8.h
  19. +9
    -8
      src/layer/x86/deconvolution_pack8to1.h
  20. +9
    -8
      src/layer/x86/deconvolution_pack8to16.h
  21. +9
    -8
      src/layer/x86/deconvolution_pack8to4.h
  22. +7
    -0
      src/layer/x86/deconvolution_x86.cpp

+ 4
- 0
src/layer/convolutiondepthwise.cpp View File

@@ -245,6 +245,10 @@ static int convolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const Mat
float* outptr = top_blob.channel(g * outch_g + p);
const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;

// shadowed variable for less openmp task args
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 5
- 0
src/layer/convolutiondepthwise3d.cpp View File

@@ -181,6 +181,11 @@ int ConvolutionDepthWise3D::forward(const Mat& bottom_blob, Mat& top_blob, const
float* outptr = top_blob.channel(g * num_output_g + p);
const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;

// shadowed variable for less openmp task args
const int outw = top_blob.w;
const int outh = top_blob.h;
const int outd = top_blob.d;

for (int z = 0; z < outd; z++)
{
for (int i = 0; i < outh; i++)


+ 8
- 8
src/layer/deconvolution.cpp View File

@@ -67,16 +67,9 @@ int Deconvolution::load_model(const ModelBin& mb)

static int deconvolution(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int activation_type, const Mat& activation_params, const Option& opt)
{
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int inch = bottom_blob.c;

const int outw = top_blob.w;
const int outh = top_blob.h;
const int outch = top_blob.c;

const int bias_term = bias_data.empty() ? 0 : 1;

const int maxk = kernel_w * kernel_h;

// kernel offsets
@@ -103,10 +96,17 @@ static int deconvolution(const Mat& bottom_blob, Mat& top_blob, const Mat& weigh
{
Mat out = top_blob.channel(p);

const float bias = bias_term ? bias_data[p] : 0.f;
const float bias = bias_data.empty() ? 0.f : bias_data[p];

out.fill(bias);

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int inch = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < h; i++)
{
for (int j = 0; j < w; j++)


+ 10
- 9
src/layer/deconvolution3d.cpp View File

@@ -74,18 +74,10 @@ int Deconvolution3D::load_model(const ModelBin& mb)

static int deconvolution3d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int kernel_d, int stride_w, int stride_h, int stride_d, int dilation_w, int dilation_h, int dilation_d, int activation_type, const Mat& activation_params, const Option& opt)
{
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int d = bottom_blob.d;
const int inch = bottom_blob.c;

const int outw = top_blob.w;
const int outh = top_blob.h;
const int outd = top_blob.d;
const int outch = top_blob.c;

const int bias_term = bias_data.empty() ? 0 : 1;

const int maxk = kernel_w * kernel_h * kernel_d;

// kernel offsets
@@ -117,10 +109,19 @@ static int deconvolution3d(const Mat& bottom_blob, Mat& top_blob, const Mat& wei
{
Mat out = top_blob.channel(p);

const float bias = bias_term ? bias_data[p] : 0.f;
const float bias = bias_data.empty() ? 0.f : bias_data[p];

out.fill(bias);

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int d = bottom_blob.d;
const int inch = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int outd = top_blob.d;

for (int z = 0; z < d; z++)
{
for (int i = 0; i < h; i++)


+ 15
- 7
src/layer/deconvolutiondepthwise.cpp View File

@@ -68,16 +68,11 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb)

static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int group, int activation_type, const Mat& activation_params, const Option& opt)
{
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int inch = bottom_blob.c;

const int outw = top_blob.w;
const int outh = top_blob.h;
const int outch = top_blob.c;

const int bias_term = bias_data.empty() ? 0 : 1;

const int maxk = kernel_w * kernel_h;

// kernel offsets
@@ -109,10 +104,16 @@ static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const M
const float* kptr = (const float*)weight_data + maxk * g;
Mat out = top_blob.channel(g);

const float bias = bias_term ? bias_data[g] : 0.f;
const float bias = bias_data.empty() ? 0.f : bias_data[g];

out.fill(bias);

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < h; i++)
{
for (int j = 0; j < w; j++)
@@ -157,10 +158,17 @@ static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const M
Mat out = top_blob.channel(g * outch_g + p);

const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;
const float bias = bias_term ? bias_data[g * outch_g + p] : 0.f;

const float bias = bias_data.empty() ? 0.f : bias_data[g * outch_g + p];

out.fill(bias);

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < h; i++)
{
for (int j = 0; j < w; j++)


+ 19
- 8
src/layer/deconvolutiondepthwise3d.cpp View File

@@ -75,18 +75,12 @@ int DeconvolutionDepthWise3D::load_model(const ModelBin& mb)

static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int kernel_d, int stride_w, int stride_h, int stride_d, int dilation_w, int dilation_h, int dilation_d, int group, int activation_type, const Mat& activation_params, const Option& opt)
{
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int d = bottom_blob.d;
const int inch = bottom_blob.c;

const int outw = top_blob.w;
const int outh = top_blob.h;
const int outd = top_blob.d;
const int outch = top_blob.c;

const int bias_term = bias_data.empty() ? 0 : 1;

const int maxk = kernel_w * kernel_h * kernel_d;

// kernel offsets
@@ -123,10 +117,18 @@ static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const
const float* kptr = (const float*)weight_data + maxk * g;
Mat out = top_blob.channel(g);

const float bias = bias_term ? bias_data[g] : 0.f;
const float bias = bias_data.empty() ? 0.f : bias_data[g];

out.fill(bias);

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int d = bottom_blob.d;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int outd = top_blob.d;

for (int z = 0; z < d; z++)
{
for (int i = 0; i < h; i++)
@@ -174,10 +176,19 @@ static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const
Mat out = top_blob.channel(g * outch_g + p);

const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;
const float bias = bias_term ? bias_data[g * outch_g + p] : 0.f;

const float bias = bias_data.empty() ? 0.f : bias_data[g * outch_g + p];

out.fill(bias);

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int d = bottom_blob.d;
const int outw = top_blob.w;
const int outh = top_blob.h;
const int outd = top_blob.d;

for (int z = 0; z < d; z++)
{
for (int i = 0; i < h; i++)


+ 9
- 8
src/layer/x86/deconvolution_pack16.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack16_avx512(const Mat& bottom_blob, Mat& top_blob, c
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack16to1.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack16to1_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack16to1_avx512(const Mat& bottom_blob, Mat& top_blob
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack16to4.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack16to4_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack16to4_avx512(const Mat& bottom_blob, Mat& top_blob
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack16to8.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack16to8_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack16to8_avx512(const Mat& bottom_blob, Mat& top_blob
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack1to16.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack1to16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack1to16_avx512(const Mat& bottom_blob, Mat& top_blob
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack1to4.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack1to4_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack1to4_sse(const Mat& bottom_blob, Mat& top_blob, co
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack1to8.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack1to8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack1to8_avx(const Mat& bottom_blob, Mat& top_blob, co
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack4.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack4_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack4_sse(const Mat& bottom_blob, Mat& top_blob, const
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack4to1.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack4to1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack4to1_sse(const Mat& bottom_blob, Mat& top_blob, co
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack4to16.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack4to16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack4to16_avx512(const Mat& bottom_blob, Mat& top_blob
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack4to8.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack4to8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack4to8_avx(const Mat& bottom_blob, Mat& top_blob, co
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack8.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack8to1.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack8to1_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack8to1_avx(const Mat& bottom_blob, Mat& top_blob, co
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack8to16.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack8to16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack8to16_avx512(const Mat& bottom_blob, Mat& top_blob
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 9
- 8
src/layer/x86/deconvolution_pack8to4.h View File

@@ -14,19 +14,11 @@

static void deconvolution_pack8to4_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

int outw = top_blob.w;
int outh = top_blob.h;
int outch = top_blob.c;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

const int maxk = kernel_w * kernel_h;

const float* bias_data_ptr = bias_data;

// num_output
@@ -35,6 +27,15 @@ static void deconvolution_pack8to4_avx(const Mat& bottom_blob, Mat& top_blob, co
{
float* outptr = top_blob.channel(p);

const int maxk = kernel_w * kernel_h;

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob.w;
const int outh = top_blob.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


+ 7
- 0
src/layer/x86/deconvolution_x86.cpp View File

@@ -311,6 +311,13 @@ int Deconvolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
{
float* outptr = top_blob_bordered.channel(p);

// shadowed variable for less openmp task args
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int channels = bottom_blob.c;
const int outw = top_blob_bordered.w;
const int outh = top_blob_bordered.h;

for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)


Loading…
Cancel
Save