Browse Source

fuse sigmoid

tags/20190611
nihui 7 years ago
parent
commit
3e003ffd98
46 changed files with 454 additions and 63 deletions
  1. +7
    -0
      src/layer/arm/convolution_arm.cpp
  2. +7
    -0
      src/layer/arm/convolutiondepthwise_arm.cpp
  3. +7
    -0
      src/layer/arm/deconvolution_arm.cpp
  4. +7
    -0
      src/layer/arm/deconvolutiondepthwise_arm.cpp
  5. +4
    -0
      src/layer/arm/innerproduct_arm.cpp
  6. +4
    -0
      src/layer/convolution.cpp
  7. +1
    -1
      src/layer/convolution.h
  8. +8
    -0
      src/layer/convolutiondepthwise.cpp
  9. +1
    -1
      src/layer/convolutiondepthwise.h
  10. +10
    -0
      src/layer/deconvolution.cpp
  11. +1
    -1
      src/layer/deconvolution.h
  12. +20
    -0
      src/layer/deconvolutiondepthwise.cpp
  13. +1
    -1
      src/layer/deconvolutiondepthwise.h
  14. +4
    -0
      src/layer/innerproduct.cpp
  15. +1
    -1
      src/layer/innerproduct.h
  16. +4
    -0
      src/layer/vulkan/shader/convolution.comp
  17. +4
    -0
      src/layer/vulkan/shader/convolution_1x1s1d1.comp
  18. +23
    -0
      src/layer/vulkan/shader/convolution_pack1to4.comp
  19. +4
    -0
      src/layer/vulkan/shader/convolution_pack4.comp
  20. +18
    -12
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_lds_8_8_2.comp
  21. +4
    -0
      src/layer/vulkan/shader/convolution_pack4to1.comp
  22. +4
    -0
      src/layer/vulkan/shader/convolutiondepthwise.comp
  23. +4
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group.comp
  24. +4
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp
  25. +4
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp
  26. +4
    -0
      src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp
  27. +4
    -0
      src/layer/vulkan/shader/convolutiondepthwise_pack4.comp
  28. +4
    -0
      src/layer/vulkan/shader/deconvolution.comp
  29. +4
    -0
      src/layer/vulkan/shader/deconvolution_pack1to4.comp
  30. +4
    -0
      src/layer/vulkan/shader/deconvolution_pack4.comp
  31. +4
    -0
      src/layer/vulkan/shader/deconvolution_pack4to1.comp
  32. +4
    -0
      src/layer/vulkan/shader/deconvolutiondepthwise.comp
  33. +4
    -0
      src/layer/vulkan/shader/deconvolutiondepthwise_group.comp
  34. +4
    -0
      src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp
  35. +4
    -0
      src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp
  36. +4
    -0
      src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp
  37. +4
    -0
      src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp
  38. +4
    -0
      src/layer/vulkan/shader/innerproduct.comp
  39. +4
    -0
      src/layer/vulkan/shader/innerproduct_pack1to4.comp
  40. +4
    -0
      src/layer/vulkan/shader/innerproduct_pack4.comp
  41. +53
    -40
      src/layer/vulkan/shader/innerproduct_pack4_lds_64.comp
  42. +4
    -0
      src/layer/vulkan/shader/innerproduct_pack4to1.comp
  43. +7
    -0
      src/layer/x86/convolution_x86.cpp
  44. +7
    -0
      src/layer/x86/convolutiondepthwise_x86.cpp
  45. +1
    -1
      src/pipeline.cpp
  46. +166
    -5
      tools/ncnnoptimize.cpp

+ 7
- 0
src/layer/arm/convolution_arm.cpp View File

@@ -64,6 +64,13 @@ int Convolution_arm::create_pipeline(const Option& opt)
pd.set(1, activation_params[1]);// max
activation->load_param(pd);
}
else if (activation_type == 4)
{
activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);

ncnn::ParamDict pd;
activation->load_param(pd);
}

if (activation)
{


+ 7
- 0
src/layer/arm/convolutiondepthwise_arm.cpp View File

@@ -62,6 +62,13 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
pd.set(1, activation_params[1]);// max
activation->load_param(pd);
}
else if (activation_type == 4)
{
activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);

ncnn::ParamDict pd;
activation->load_param(pd);
}

if (activation)
{


+ 7
- 0
src/layer/arm/deconvolution_arm.cpp View File

@@ -53,6 +53,13 @@ int Deconvolution_arm::create_pipeline(const Option& opt)
pd.set(1, activation_params[1]);// max
activation->load_param(pd);
}
else if (activation_type == 4)
{
activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);

ncnn::ParamDict pd;
activation->load_param(pd);
}

if (activation)
{


+ 7
- 0
src/layer/arm/deconvolutiondepthwise_arm.cpp View File

@@ -50,6 +50,13 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
pd.set(1, activation_params[1]);// max
activation->load_param(pd);
}
else if (activation_type == 4)
{
activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);

ncnn::ParamDict pd;
activation->load_param(pd);
}

if (activation)
{


+ 4
- 0
src/layer/arm/innerproduct_arm.cpp View File

@@ -299,6 +299,10 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
if (sum > max)
sum = max;
}
else if (activation_type == 4)
{
sum = 1.f / (1.f + exp(-sum));
}

top_blob[p] = sum;
}


+ 4
- 0
src/layer/convolution.cpp View File

@@ -551,6 +551,10 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
if (sum > max)
sum = max;
}
else if (activation_type == 4)
{
sum = 1.f / (1.f + exp(-sum));
}

outptr[j] = sum;
}


+ 1
- 1
src/layer/convolution.h View File

@@ -52,7 +52,7 @@ public:

int int8_scale_term;

// 0=none 1=relu 2=leakyrelu 3=clip
// 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
int activation_type;
Mat activation_params;



+ 8
- 0
src/layer/convolutiondepthwise.cpp View File

@@ -655,6 +655,10 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
if (sum > max)
sum = max;
}
else if (activation_type == 4)
{
sum = 1.f / (1.f + exp(-sum));
}

outptr[j] = sum;
}
@@ -726,6 +730,10 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
if (sum > max)
sum = max;
}
else if (activation_type == 4)
{
sum = 1.f / (1.f + exp(-sum));
}

outptr[j] = sum;
}


+ 1
- 1
src/layer/convolutiondepthwise.h View File

@@ -53,7 +53,7 @@ public:

int int8_scale_term;

// 0=none 1=relu 2=leakyrelu 3=clip
// 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
int activation_type;
Mat activation_params;



+ 10
- 0
src/layer/deconvolution.cpp View File

@@ -186,6 +186,16 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option&
outptr[i] = max;
}
}
else if (activation_type == 4)
{
float* outptr = out;
int size = outw * outh;

for (int i = 0; i < size; i++)
{
outptr[i] = 1.f / (1.f + exp(-outptr[i]));
}
}
}

if (pad_w > 0 || pad_h > 0)


+ 1
- 1
src/layer/deconvolution.h View File

@@ -45,7 +45,7 @@ public:

int weight_data_size;

// 0=none 1=relu 2=leakyrelu 3=clip
// 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
int activation_type;
Mat activation_params;



+ 20
- 0
src/layer/deconvolutiondepthwise.cpp View File

@@ -186,6 +186,16 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const
outptr[i] = max;
}
}
else if (activation_type == 4)
{
float* outptr = m;
int size = outw * outh;

for (int i = 0; i < size; i++)
{
outptr[i] = 1.f / (1.f + exp(-outptr[i]));
}
}
}
}
else
@@ -270,6 +280,16 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const
outptr[i] = max;
}
}
else if (activation_type == 4)
{
float* outptr = out;
int size = outw * outh;

for (int i = 0; i < size; i++)
{
outptr[i] = 1.f / (1.f + exp(-outptr[i]));
}
}
}
}
}


+ 1
- 1
src/layer/deconvolutiondepthwise.h View File

@@ -46,7 +46,7 @@ public:
int weight_data_size;
int group;

// 0=none 1=relu 2=leakyrelu 3=clip
// 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
int activation_type;
Mat activation_params;



+ 4
- 0
src/layer/innerproduct.cpp View File

@@ -289,6 +289,10 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
if (sum > max)
sum = max;
}
else if (activation_type == 4)
{
sum = 1.f / (1.f + exp(-sum));
}

top_blob[p] = sum;
}


+ 1
- 1
src/layer/innerproduct.h View File

@@ -42,7 +42,7 @@ public:

int int8_scale_term;

// 0=none 1=relu 2=leakyrelu 3=clip
// 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
int activation_type;
Mat activation_params;



+ 4
- 0
src/layer/vulkan/shader/convolution.comp View File

@@ -109,6 +109,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum);
}

+ 4
- 0
src/layer/vulkan/shader/convolution_1x1s1d1.comp View File

@@ -94,6 +94,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep_4 + gx] = sfpvec4(sum);
}

+ 23
- 0
src/layer/vulkan/shader/convolution_pack1to4.comp View File

@@ -28,6 +28,9 @@ layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
@@ -95,5 +98,25 @@ void main()
}
}

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum);
}

+ 4
- 0
src/layer/vulkan/shader/convolution_pack4.comp View File

@@ -128,6 +128,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum);
}

+ 18
- 12
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_lds_8_8_2.comp View File

@@ -88,20 +88,25 @@ void main()
int v_offset = gy * p.w + gx;
int w_offset = gz * p.c * 3 * 3;

int lv_offset = ly * (/*gl_WorkGroupSize.x*/8 + 2) + lx;
int lw_offset = lz * 3 * 3;

for (int z = 0; z < p.c; z++)
{
int lv_offset = ly * int(/*gl_WorkGroupSize.x*/8 + 2) + lx;
int lw_offset = lz * 3 * 3;

barrier();

// load v to local cache
if (lz == 0)
{
int v_offset_1 = v_offset + p.w + 1;
int lv_offset_1 = lv_offset + int(/*gl_WorkGroupSize.x*/8 + 2) + 1;
int lv_offset_1 = lv_offset + (/*gl_WorkGroupSize.x*/8 + 2) + 1;

lv[lv_offset_1] = afpvec4(bottom_blob_data[v_offset_1]);
}
else
{
int v_offset_1 = v_offset + p.w + 1;
int lv_offset_1 = lv_offset + (/*gl_WorkGroupSize.x*/8 + 2) + 1;

// left and right border
if (lx == 0)
@@ -170,20 +175,17 @@ void main()
{
for (int x = 0; x < 3; x++)
{
afpvec4 v = lv[lv_offset + x];
afpvec4 v = lv[lv_offset + y * (/*gl_WorkGroupSize.x*/8 + 2) + x];

afpmat4 k = afpmat4(
lk0[lw_offset + x],
lk1[lw_offset + x],
lk2[lw_offset + x],
lk3[lw_offset + x]
lk0[lw_offset + y * 3 + x],
lk1[lw_offset + y * 3 + x],
lk2[lw_offset + y * 3 + x],
lk3[lw_offset + y * 3 + x]
);

sum += v * k;
}

lv_offset += int(/*gl_WorkGroupSize.x*/8 + 2);
lw_offset += 3;
}

v_offset += p.cstep;
@@ -208,6 +210,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum);
}

+ 4
- 0
src/layer/vulkan/shader/convolution_pack4to1.comp View File

@@ -113,6 +113,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum);
}

+ 4
- 0
src/layer/vulkan/shader/convolutiondepthwise.comp View File

@@ -107,6 +107,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum);
}

+ 4
- 0
src/layer/vulkan/shader/convolutiondepthwise_group.comp View File

@@ -120,6 +120,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum);
}

+ 4
- 0
src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp View File

@@ -124,6 +124,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum);
}

+ 4
- 0
src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp View File

@@ -139,6 +139,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum);
}

+ 4
- 0
src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp View File

@@ -124,6 +124,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum);
}

+ 4
- 0
src/layer/vulkan/shader/convolutiondepthwise_pack4.comp View File

@@ -111,6 +111,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum);
}

+ 4
- 0
src/layer/vulkan/shader/deconvolution.comp View File

@@ -129,6 +129,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum);
}

+ 4
- 0
src/layer/vulkan/shader/deconvolution_pack1to4.comp View File

@@ -133,6 +133,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum);
}

+ 4
- 0
src/layer/vulkan/shader/deconvolution_pack4.comp View File

@@ -148,6 +148,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum);
}

+ 4
- 0
src/layer/vulkan/shader/deconvolution_pack4to1.comp View File

@@ -133,6 +133,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum);
}

+ 4
- 0
src/layer/vulkan/shader/deconvolutiondepthwise.comp View File

@@ -126,6 +126,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum);
}

+ 4
- 0
src/layer/vulkan/shader/deconvolutiondepthwise_group.comp View File

@@ -138,6 +138,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum);
}

+ 4
- 0
src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp View File

@@ -142,6 +142,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum);
}

+ 4
- 0
src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp View File

@@ -157,6 +157,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum);
}

+ 4
- 0
src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp View File

@@ -142,6 +142,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum);
}

+ 4
- 0
src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp View File

@@ -130,6 +130,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum);
}

+ 4
- 0
src/layer/vulkan/shader/innerproduct.comp View File

@@ -92,6 +92,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gx] = sfp(sum);
}

+ 4
- 0
src/layer/vulkan/shader/innerproduct_pack1to4.comp View File

@@ -96,6 +96,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gx] = sfpvec4(sum);
}

+ 4
- 0
src/layer/vulkan/shader/innerproduct_pack4.comp View File

@@ -111,6 +111,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gx] = sfpvec4(sum);
}

+ 53
- 40
src/layer/vulkan/shader/innerproduct_pack4_lds_64.comp View File

@@ -64,9 +64,6 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= p.outw || gy >= 1 || gz >= 1)
return;

int lx = int(gl_LocalInvocationID.x);
int ly = int(gl_LocalInvocationID.y);
int lz = int(gl_LocalInvocationID.z);
@@ -90,94 +87,103 @@ void main()
barrier();

// load v to local cache
if (lz < 64)
if (lx < 64)
{
lv[lz] = afpvec4(bottom_blob_data[i + lz]);
lv[lx] = afpvec4(bottom_blob_data[i + lx]);
}

barrier();
memoryBarrierShared();

afpvec4 v = lv[lz];
for (int j=0; j<64; j++)
{
afpvec4 v = lv[j];

#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
afpvec4(weight_data[(w_offset + i) * 4 + 0]),
afpvec4(weight_data[(w_offset + i) * 4 + 1]),
afpvec4(weight_data[(w_offset + i) * 4 + 2]),
afpvec4(weight_data[(w_offset + i) * 4 + 3])
);
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
afpvec4(weight_data[(w_offset + i + j) * 4 + 0]),
afpvec4(weight_data[(w_offset + i + j) * 4 + 1]),
afpvec4(weight_data[(w_offset + i + j) * 4 + 2]),
afpvec4(weight_data[(w_offset + i + j) * 4 + 3])
);
#else
afpmat4 k = afpmat4(weight_data[w_offset + i]);
afpmat4 k = afpmat4(weight_data[w_offset + i + j]);
#endif

sum += v * k;
sum += v * k;
}
}
for (; i+15 < p.w; i+=16)
{
barrier();

// load v to local cache
if (lz < 16)
if (lx < 16)
{
lv[lz] = afpvec4(bottom_blob_data[i + lz]);
lv[lx] = afpvec4(bottom_blob_data[i + lx]);
}

barrier();
memoryBarrierShared();

afpvec4 v = lv[lz];
for (int j=0; j<16; j++)
{
afpvec4 v = lv[j];

#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
afpvec4(weight_data[(w_offset + i) * 4 + 0]),
afpvec4(weight_data[(w_offset + i) * 4 + 1]),
afpvec4(weight_data[(w_offset + i) * 4 + 2]),
afpvec4(weight_data[(w_offset + i) * 4 + 3])
);
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
afpvec4(weight_data[(w_offset + i + j) * 4 + 0]),
afpvec4(weight_data[(w_offset + i + j) * 4 + 1]),
afpvec4(weight_data[(w_offset + i + j) * 4 + 2]),
afpvec4(weight_data[(w_offset + i + j) * 4 + 3])
);
#else
afpmat4 k = afpmat4(weight_data[w_offset + i]);
afpmat4 k = afpmat4(weight_data[w_offset + i + j]);
#endif

sum += v * k;
sum += v * k;
}
}
for (; i+3 < p.w; i+=4)
{
barrier();

// load v to local cache
if (lz < 4)
if (lx < 4)
{
lv[lz] = afpvec4(bottom_blob_data[i + lz]);
lv[lx] = afpvec4(bottom_blob_data[i + lx]);
}

barrier();
memoryBarrierShared();

afpvec4 v = lv[lz];
for (int j=0; j<4; j++)
{
afpvec4 v = lv[j];

#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
afpvec4(weight_data[(w_offset + i) * 4 + 0]),
afpvec4(weight_data[(w_offset + i) * 4 + 1]),
afpvec4(weight_data[(w_offset + i) * 4 + 2]),
afpvec4(weight_data[(w_offset + i) * 4 + 3])
);
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
afpvec4(weight_data[(w_offset + i + j) * 4 + 0]),
afpvec4(weight_data[(w_offset + i + j) * 4 + 1]),
afpvec4(weight_data[(w_offset + i + j) * 4 + 2]),
afpvec4(weight_data[(w_offset + i + j) * 4 + 3])
);
#else
afpmat4 k = afpmat4(weight_data[w_offset + i]);
afpmat4 k = afpmat4(weight_data[w_offset + i + j]);
#endif

sum += v * k;
sum += v * k;
}
}
for (; i < p.w; i++)
{
barrier();

// load v to local cache
if (lz == 0)
if (lx == 0)
{
lv[0] = afpvec4(bottom_blob_data[i]);
}
@@ -202,6 +208,9 @@ void main()
sum += v * k;
}

if (gx >= p.outw || gy >= 1 || gz >= 1)
return;

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
@@ -217,6 +226,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gx] = sfpvec4(sum);
}

+ 4
- 0
src/layer/vulkan/shader/innerproduct_pack4to1.comp View File

@@ -96,6 +96,10 @@ void main()
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}

top_blob_data[gx] = sfp(sum);
}

+ 7
- 0
src/layer/x86/convolution_x86.cpp View File

@@ -65,6 +65,13 @@ int Convolution_x86::create_pipeline(const Option& opt)
pd.set(1, activation_params[1]);// max
activation->load_param(pd);
}
else if (activation_type == 4)
{
activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);

ncnn::ParamDict pd;
activation->load_param(pd);
}

if (activation)
{


+ 7
- 0
src/layer/x86/convolutiondepthwise_x86.cpp View File

@@ -62,6 +62,13 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
pd.set(1, activation_params[1]);// max
activation->load_param(pd);
}
else if (activation_type == 4)
{
activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);

ncnn::ParamDict pd;
activation->load_param(pd);
}

if (activation)
{


+ 1
- 1
src/pipeline.cpp View File

@@ -45,7 +45,7 @@ int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const char*
{
local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size);

fprintf(stderr, "local_shader_module %p %s created\n", local_shader_module, entry_name);
// fprintf(stderr, "local_shader_module %p %s created\n", local_shader_module, entry_name);

create_descriptorset_layout(binding_count);



+ 166
- 5
tools/ncnnoptimize.cpp View File

@@ -91,8 +91,10 @@ public:

int eliminate_dropout();
int eliminate_flatten_after_global_pooling();
int eliminate_flatten_after_innerproduct();

int replace_convolution_with_innerproduct_after_global_pooling();
int replace_convolution_with_innerproduct_after_innerproduct();

public:
int fprintf_param_int_array(int id, const ncnn::Mat& m, FILE* pp);
@@ -662,7 +664,7 @@ int NetOptimize::fuse_convolution_activation()
int j = i + 1;
for (; j<layer_count; j++)
{
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip")
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
continue;

if (layers[j]->bottoms.size() != 1)
@@ -705,6 +707,10 @@ int NetOptimize::fuse_convolution_activation()
convolution->activation_params[0] = clip->min;
convolution->activation_params[1] = clip->max;
}
else if (activation->type == "Sigmoid")
{
convolution->activation_type = 4;
}

int top_blob_index_final = activation->tops[0];
convolution->tops[0] = top_blob_index_final;
@@ -729,7 +735,7 @@ int NetOptimize::fuse_convolutiondepthwise_activation()
int j = i + 1;
for (; j<layer_count; j++)
{
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip")
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
continue;

if (layers[j]->bottoms.size() != 1)
@@ -772,6 +778,10 @@ int NetOptimize::fuse_convolutiondepthwise_activation()
convolutiondepthwise->activation_params[0] = clip->min;
convolutiondepthwise->activation_params[1] = clip->max;
}
else if (activation->type == "Sigmoid")
{
convolutiondepthwise->activation_type = 4;
}

int top_blob_index_final = activation->tops[0];
convolutiondepthwise->tops[0] = top_blob_index_final;
@@ -796,7 +806,7 @@ int NetOptimize::fuse_deconvolution_activation()
int j = i + 1;
for (; j<layer_count; j++)
{
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip")
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
continue;

if (layers[j]->bottoms.size() != 1)
@@ -839,6 +849,10 @@ int NetOptimize::fuse_deconvolution_activation()
deconvolution->activation_params[0] = clip->min;
deconvolution->activation_params[1] = clip->max;
}
else if (activation->type == "Sigmoid")
{
deconvolution->activation_type = 4;
}

int top_blob_index_final = activation->tops[0];
deconvolution->tops[0] = top_blob_index_final;
@@ -863,7 +877,7 @@ int NetOptimize::fuse_deconvolutiondepthwise_activation()
int j = i + 1;
for (; j<layer_count; j++)
{
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip")
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
continue;

if (layers[j]->bottoms.size() != 1)
@@ -906,6 +920,10 @@ int NetOptimize::fuse_deconvolutiondepthwise_activation()
deconvolutiondepthwise->activation_params[0] = clip->min;
deconvolutiondepthwise->activation_params[1] = clip->max;
}
else if (activation->type == "Sigmoid")
{
deconvolutiondepthwise->activation_type = 4;
}

int top_blob_index_final = activation->tops[0];
deconvolutiondepthwise->tops[0] = top_blob_index_final;
@@ -930,7 +948,7 @@ int NetOptimize::fuse_innerproduct_activation()
int j = i + 1;
for (; j<layer_count; j++)
{
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip")
if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
continue;

if (layers[j]->bottoms.size() != 1)
@@ -973,6 +991,10 @@ int NetOptimize::fuse_innerproduct_activation()
innerproduct->activation_params[0] = clip->min;
innerproduct->activation_params[1] = clip->max;
}
else if (activation->type == "Sigmoid")
{
innerproduct->activation_type = 4;
}

int top_blob_index_final = activation->tops[0];
innerproduct->tops[0] = top_blob_index_final;
@@ -1071,6 +1093,47 @@ int NetOptimize::eliminate_flatten_after_global_pooling()
return 0;
}

// Remove Flatten layers that directly consume the output of an InnerProduct.
// For each InnerProduct, the first later Flatten whose single bottom blob is
// the InnerProduct's top blob is bypassed: the InnerProduct is rewired to
// produce the Flatten's top blob, and the Flatten is retyped to "ncnnfused"
// (presumably so it is dropped when the network is saved - confirm in save()).
// Always returns 0.
int NetOptimize::eliminate_flatten_after_innerproduct()
{
    const int layer_count = layers.size();
    for (int ip_index = 0; ip_index < layer_count; ip_index++)
    {
        if (layers[ip_index]->type != "InnerProduct")
            continue;

        // InnerProduct - Flatten
        const int ip_top_blob = layers[ip_index]->tops[0];

        // search forward for the Flatten that reads this blob
        int flatten_index = layer_count;
        for (int k = ip_index + 1; k < layer_count; k++)
        {
            if (layers[k]->type == "Flatten" && layers[k]->bottoms.size() == 1 && layers[k]->bottoms[0] == ip_top_blob)
            {
                flatten_index = k;
                break;
            }
        }

        // no matching Flatten follows this InnerProduct
        if (flatten_index == layer_count)
            continue;

        ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[ip_index];
        ncnn::Flatten* flatten = (ncnn::Flatten*)layers[flatten_index];

        fprintf(stderr, "eliminate_flatten_after_innerproduct %s %s\n", innerproduct->name.c_str(), flatten->name.c_str());

        // let the InnerProduct write straight into the blob the Flatten used to produce
        const int final_blob = flatten->tops[0];
        innerproduct->tops[0] = final_blob;
        blobs[final_blob].producer = ip_index;

        flatten->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling()
{
const int layer_count = layers.size();
@@ -1123,6 +1186,9 @@ int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling()
innerproduct->weight_data = convolution->weight_data;
innerproduct->bias_data = convolution->bias_data;

innerproduct->activation_type = convolution->activation_type;
innerproduct->activation_params = convolution->activation_params;

layers[j] = innerproduct;
delete convolution;
}
@@ -1130,6 +1196,75 @@ int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling()
return 0;
}

int NetOptimize::replace_convolution_with_innerproduct_after_innerproduct()
{
const int layer_count = layers.size();
for (;;)
{
bool replaced = false;

for (int i=0; i<layer_count; i++)
{
if (layers[i]->type != "InnerProduct")
continue;

// InnerProduct - Convolution
int top_blob_index = layers[i]->tops[0];

int j = i + 1;
for (; j<layer_count; j++)
{
if (layers[j]->type != "Convolution")
continue;

if (layers[j]->bottoms.size() != 1)
continue;

if (layers[j]->bottoms[0] == top_blob_index)
break;
}

if (j == layer_count)
continue;

ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j];

fprintf(stderr, "replace_convolution_with_innerproduct_after_innerproduct %s %s\n", innerproduct->name.c_str(), convolution->name.c_str());

ncnn::InnerProduct* innerproduct2 = (ncnn::InnerProduct*)ncnn::create_layer("InnerProduct");

innerproduct2->type = "InnerProduct";
innerproduct2->name = convolution->name;
innerproduct2->bottoms = convolution->bottoms;
innerproduct2->tops = convolution->tops;

ncnn::ParamDict pd;
innerproduct2->load_param(pd);

innerproduct2->num_output = convolution->num_output;
innerproduct2->bias_term = convolution->bias_term;
innerproduct2->weight_data_size = convolution->weight_data_size;

innerproduct2->weight_data = convolution->weight_data;
innerproduct2->bias_data = convolution->bias_data;

innerproduct2->activation_type = convolution->activation_type;
innerproduct2->activation_params = convolution->activation_params;

layers[j] = innerproduct2;
delete convolution;

replaced = true;
}

if (!replaced)
break;
}

return 0;
}

int NetOptimize::fprintf_param_int_array(int id, const ncnn::Mat& m, FILE* pp)
{
const int count = m.w;
@@ -1158,8 +1293,15 @@ int NetOptimize::fprintf_param_float_array(int id, const ncnn::Mat& m, FILE* pp)
return 0;
}

// Round sz up to a multiple of n using a bit mask.
// The result is only a true multiple of n when n is a power of two.
static inline size_t alignSize(size_t sz, int n)
{
    const size_t mask = static_cast<size_t>(n) - 1;
    return (sz + mask) & ~mask;
}

int NetOptimize::fwrite_weight_tag_data(int tag, const ncnn::Mat& data, FILE* bp)
{
int p0 = ftell(bp);

ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.c);
if (storage_type == 1 && tag == 0)
{
@@ -1174,13 +1316,29 @@ int NetOptimize::fwrite_weight_tag_data(int tag, const ncnn::Mat& data, FILE* bp
fwrite(&tag, sizeof(int), 1, bp);
fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp);
}

// padding to 32bit align
int nwrite = ftell(bp) - p0;
int nalign = alignSize(nwrite, 4);
unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00};
fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp);

return 0;
}

// Write the raw contents of `data` to `bp` as one flat array, then append
// zero bytes so the number of bytes emitted by this call is a multiple of 4.
// The byte count is measured with ftell() before and after the write.
// Always returns 0; fwrite/ftell errors are not reported.
int NetOptimize::fwrite_weight_data(const ncnn::Mat& data, FILE* bp)
{
    // ftell() returns long: keep the full width so offsets beyond INT_MAX
    // are neither truncated nor overflowed when subtracted
    const long p0 = ftell(bp);

    ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.c);
    fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp);

    // padding to 32bit align
    const long nwrite = ftell(bp) - p0;
    const size_t nalign = alignSize(nwrite, 4);
    const unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00};
    fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp);

    return 0;
}

@@ -1835,6 +1993,9 @@ int main(int argc, char** argv)
optimizer.eliminate_flatten_after_global_pooling();

optimizer.replace_convolution_with_innerproduct_after_global_pooling();
optimizer.replace_convolution_with_innerproduct_after_innerproduct();

optimizer.eliminate_flatten_after_innerproduct();

optimizer.save(outparam, outbin);



Loading…
Cancel
Save