diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index f202a4889..3eb8e635e 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -64,6 +64,13 @@ int Convolution_arm::create_pipeline(const Option& opt) pd.set(1, activation_params[1]);// max activation->load_param(pd); } + else if (activation_type == 4) + { + activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); + + ncnn::ParamDict pd; + activation->load_param(pd); + } if (activation) { diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp index b844bc617..8e4e1486f 100644 --- a/src/layer/arm/convolutiondepthwise_arm.cpp +++ b/src/layer/arm/convolutiondepthwise_arm.cpp @@ -62,6 +62,13 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) pd.set(1, activation_params[1]);// max activation->load_param(pd); } + else if (activation_type == 4) + { + activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); + + ncnn::ParamDict pd; + activation->load_param(pd); + } if (activation) { diff --git a/src/layer/arm/deconvolution_arm.cpp b/src/layer/arm/deconvolution_arm.cpp index 5c8e419bb..fcecc687e 100644 --- a/src/layer/arm/deconvolution_arm.cpp +++ b/src/layer/arm/deconvolution_arm.cpp @@ -53,6 +53,13 @@ int Deconvolution_arm::create_pipeline(const Option& opt) pd.set(1, activation_params[1]);// max activation->load_param(pd); } + else if (activation_type == 4) + { + activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); + + ncnn::ParamDict pd; + activation->load_param(pd); + } if (activation) { diff --git a/src/layer/arm/deconvolutiondepthwise_arm.cpp b/src/layer/arm/deconvolutiondepthwise_arm.cpp index fec80d51f..a6e88821d 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm.cpp +++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp @@ -50,6 +50,13 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt) pd.set(1, activation_params[1]);// max activation->load_param(pd); } + 
else if (activation_type == 4) + { + activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); + + ncnn::ParamDict pd; + activation->load_param(pd); + } if (activation) { diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp index 86cd7caf3..d813b4f69 100644 --- a/src/layer/arm/innerproduct_arm.cpp +++ b/src/layer/arm/innerproduct_arm.cpp @@ -299,6 +299,10 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio if (sum > max) sum = max; } + else if (activation_type == 4) + { + sum = 1.f / (1.f + exp(-sum)); + } top_blob[p] = sum; } diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp index 3a2c6de9a..cac836b9c 100644 --- a/src/layer/convolution.cpp +++ b/src/layer/convolution.cpp @@ -551,6 +551,10 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op if (sum > max) sum = max; } + else if (activation_type == 4) + { + sum = 1.f / (1.f + exp(-sum)); + } outptr[j] = sum; } diff --git a/src/layer/convolution.h b/src/layer/convolution.h index b3cfd4ad6..9ac5ae4c4 100644 --- a/src/layer/convolution.h +++ b/src/layer/convolution.h @@ -52,7 +52,7 @@ public: int int8_scale_term; - // 0=none 1=relu 2=leakyrelu 3=clip + // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid int activation_type; Mat activation_params; diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp index 9ba627d84..6ec4daef0 100644 --- a/src/layer/convolutiondepthwise.cpp +++ b/src/layer/convolutiondepthwise.cpp @@ -655,6 +655,10 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O if (sum > max) sum = max; } + else if (activation_type == 4) + { + sum = 1.f / (1.f + exp(-sum)); + } outptr[j] = sum; } @@ -726,6 +730,10 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O if (sum > max) sum = max; } + else if (activation_type == 4) + { + sum = 1.f / (1.f + exp(-sum)); + } outptr[j] = sum; } diff --git 
a/src/layer/convolutiondepthwise.h b/src/layer/convolutiondepthwise.h index e212d19dd..a61497e8a 100644 --- a/src/layer/convolutiondepthwise.h +++ b/src/layer/convolutiondepthwise.h @@ -53,7 +53,7 @@ public: int int8_scale_term; - // 0=none 1=relu 2=leakyrelu 3=clip + // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid int activation_type; Mat activation_params; diff --git a/src/layer/deconvolution.cpp b/src/layer/deconvolution.cpp index c7e65d9ed..a84815587 100644 --- a/src/layer/deconvolution.cpp +++ b/src/layer/deconvolution.cpp @@ -186,6 +186,16 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& outptr[i] = max; } } + else if (activation_type == 4) + { + float* outptr = out; + int size = outw * outh; + + for (int i = 0; i < size; i++) + { + outptr[i] = 1.f / (1.f + exp(-outptr[i])); + } + } } if (pad_w > 0 || pad_h > 0) diff --git a/src/layer/deconvolution.h b/src/layer/deconvolution.h index 05fe058d6..e4e873fbb 100644 --- a/src/layer/deconvolution.h +++ b/src/layer/deconvolution.h @@ -45,7 +45,7 @@ public: int weight_data_size; - // 0=none 1=relu 2=leakyrelu 3=clip + // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid int activation_type; Mat activation_params; diff --git a/src/layer/deconvolutiondepthwise.cpp b/src/layer/deconvolutiondepthwise.cpp index 1ac58b3de..1c3931499 100644 --- a/src/layer/deconvolutiondepthwise.cpp +++ b/src/layer/deconvolutiondepthwise.cpp @@ -186,6 +186,16 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const outptr[i] = max; } } + else if (activation_type == 4) + { + float* outptr = m; + int size = outw * outh; + + for (int i = 0; i < size; i++) + { + outptr[i] = 1.f / (1.f + exp(-outptr[i])); + } + } } } else @@ -270,6 +280,16 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const outptr[i] = max; } } + else if (activation_type == 4) + { + float* outptr = out; + int size = outw * outh; + + for (int i = 0; i < size; i++) + { + outptr[i] = 1.f / (1.f + 
exp(-outptr[i])); + } + } } } } diff --git a/src/layer/deconvolutiondepthwise.h b/src/layer/deconvolutiondepthwise.h index 315d504b7..db6782b0a 100644 --- a/src/layer/deconvolutiondepthwise.h +++ b/src/layer/deconvolutiondepthwise.h @@ -46,7 +46,7 @@ public: int weight_data_size; int group; - // 0=none 1=relu 2=leakyrelu 3=clip + // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid int activation_type; Mat activation_params; diff --git a/src/layer/innerproduct.cpp b/src/layer/innerproduct.cpp index f7dc7a1a6..979903d6d 100644 --- a/src/layer/innerproduct.cpp +++ b/src/layer/innerproduct.cpp @@ -289,6 +289,10 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o if (sum > max) sum = max; } + else if (activation_type == 4) + { + sum = 1.f / (1.f + exp(-sum)); + } top_blob[p] = sum; } diff --git a/src/layer/innerproduct.h b/src/layer/innerproduct.h index 76d4b551a..56ae53973 100644 --- a/src/layer/innerproduct.h +++ b/src/layer/innerproduct.h @@ -42,7 +42,7 @@ public: int int8_scale_term; - // 0=none 1=relu 2=leakyrelu 3=clip + // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid int activation_type; Mat activation_params; diff --git a/src/layer/vulkan/shader/convolution.comp b/src/layer/vulkan/shader/convolution.comp index b1bfe9878..72d03dab4 100644 --- a/src/layer/vulkan/shader/convolution.comp +++ b/src/layer/vulkan/shader/convolution.comp @@ -109,6 +109,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); } diff --git a/src/layer/vulkan/shader/convolution_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_1x1s1d1.comp index 88102d1a1..661af4a59 100644 --- a/src/layer/vulkan/shader/convolution_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_1x1s1d1.comp @@ -94,6 +94,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, 
const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep_4 + gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/convolution_pack1to4.comp b/src/layer/vulkan/shader/convolution_pack1to4.comp index 35243e7d6..cdecb692e 100644 --- a/src/layer/vulkan/shader/convolution_pack1to4.comp +++ b/src/layer/vulkan/shader/convolution_pack1to4.comp @@ -28,6 +28,9 @@ layout (constant_id = 3) const int dilation_h = 1; layout (constant_id = 4) const int stride_w = 1; layout (constant_id = 5) const int stride_h = 1; layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; @@ -95,5 +98,25 @@ void main() } } + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/convolution_pack4.comp b/src/layer/vulkan/shader/convolution_pack4.comp index d28340857..002241458 100644 --- a/src/layer/vulkan/shader/convolution_pack4.comp +++ b/src/layer/vulkan/shader/convolution_pack4.comp @@ -128,6 +128,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); } diff --git 
a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_lds_8_8_2.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_lds_8_8_2.comp index aec14defc..66fce0c76 100644 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_lds_8_8_2.comp +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_lds_8_8_2.comp @@ -88,20 +88,25 @@ void main() int v_offset = gy * p.w + gx; int w_offset = gz * p.c * 3 * 3; + int lv_offset = ly * (/*gl_WorkGroupSize.x*/8 + 2) + lx; + int lw_offset = lz * 3 * 3; + for (int z = 0; z < p.c; z++) { - int lv_offset = ly * int(/*gl_WorkGroupSize.x*/8 + 2) + lx; - int lw_offset = lz * 3 * 3; - barrier(); // load v to local cache if (lz == 0) { int v_offset_1 = v_offset + p.w + 1; - int lv_offset_1 = lv_offset + int(/*gl_WorkGroupSize.x*/8 + 2) + 1; + int lv_offset_1 = lv_offset + (/*gl_WorkGroupSize.x*/8 + 2) + 1; lv[lv_offset_1] = afpvec4(bottom_blob_data[v_offset_1]); + } + else + { + int v_offset_1 = v_offset + p.w + 1; + int lv_offset_1 = lv_offset + (/*gl_WorkGroupSize.x*/8 + 2) + 1; // left and right border if (lx == 0) @@ -170,20 +175,17 @@ void main() { for (int x = 0; x < 3; x++) { - afpvec4 v = lv[lv_offset + x]; + afpvec4 v = lv[lv_offset + y * (/*gl_WorkGroupSize.x*/8 + 2) + x]; afpmat4 k = afpmat4( - lk0[lw_offset + x], - lk1[lw_offset + x], - lk2[lw_offset + x], - lk3[lw_offset + x] + lk0[lw_offset + y * 3 + x], + lk1[lw_offset + y * 3 + x], + lk2[lw_offset + y * 3 + x], + lk3[lw_offset + y * 3 + x] ); sum += v * k; } - - lv_offset += int(/*gl_WorkGroupSize.x*/8 + 2); - lw_offset += 3; } v_offset += p.cstep; @@ -208,6 +210,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/convolution_pack4to1.comp b/src/layer/vulkan/shader/convolution_pack4to1.comp index b900e24ac..7242a8f8f 100644 --- 
a/src/layer/vulkan/shader/convolution_pack4to1.comp +++ b/src/layer/vulkan/shader/convolution_pack4to1.comp @@ -113,6 +113,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); } diff --git a/src/layer/vulkan/shader/convolutiondepthwise.comp b/src/layer/vulkan/shader/convolutiondepthwise.comp index 1599ae22c..6a12526e8 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise.comp @@ -107,6 +107,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group.comp b/src/layer/vulkan/shader/convolutiondepthwise_group.comp index 604069b2d..9e99a8ba9 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group.comp @@ -120,6 +120,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp index 9e475ac4d..fbc403813 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp @@ -124,6 +124,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * 
p.outcstep + gy * p.outw + gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp index 059a9e8f4..b62c86fbf 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp @@ -139,6 +139,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp b/src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp index 98422710e..72a46acb5 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp @@ -124,6 +124,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); } diff --git a/src/layer/vulkan/shader/convolutiondepthwise_pack4.comp b/src/layer/vulkan/shader/convolutiondepthwise_pack4.comp index bf81cda6c..6fd470d59 100644 --- a/src/layer/vulkan/shader/convolutiondepthwise_pack4.comp +++ b/src/layer/vulkan/shader/convolutiondepthwise_pack4.comp @@ -111,6 +111,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/deconvolution.comp b/src/layer/vulkan/shader/deconvolution.comp index 8393e04fd..cece3d975 100644 --- a/src/layer/vulkan/shader/deconvolution.comp +++ b/src/layer/vulkan/shader/deconvolution.comp 
@@ -129,6 +129,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); } diff --git a/src/layer/vulkan/shader/deconvolution_pack1to4.comp b/src/layer/vulkan/shader/deconvolution_pack1to4.comp index e4be824b2..a1e09629d 100644 --- a/src/layer/vulkan/shader/deconvolution_pack1to4.comp +++ b/src/layer/vulkan/shader/deconvolution_pack1to4.comp @@ -133,6 +133,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/deconvolution_pack4.comp b/src/layer/vulkan/shader/deconvolution_pack4.comp index 7325498ba..9d2e5d897 100644 --- a/src/layer/vulkan/shader/deconvolution_pack4.comp +++ b/src/layer/vulkan/shader/deconvolution_pack4.comp @@ -148,6 +148,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/deconvolution_pack4to1.comp b/src/layer/vulkan/shader/deconvolution_pack4to1.comp index a00e2a633..c14593481 100644 --- a/src/layer/vulkan/shader/deconvolution_pack4to1.comp +++ b/src/layer/vulkan/shader/deconvolution_pack4to1.comp @@ -133,6 +133,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise.comp b/src/layer/vulkan/shader/deconvolutiondepthwise.comp index 
92ce29e6f..f71ae1c2f 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise.comp @@ -126,6 +126,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_group.comp index fc92b3825..8380c66da 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_group.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group.comp @@ -138,6 +138,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp index 4d2104305..cfebbbf86 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp @@ -142,6 +142,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp index 09726b9ad..9a178c131 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp @@ -157,6 +157,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, 
const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp index 1a471f06c..5689aecbd 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp @@ -142,6 +142,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); } diff --git a/src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp b/src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp index 712211bf8..6e4860d57 100644 --- a/src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp +++ b/src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp @@ -130,6 +130,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/innerproduct.comp b/src/layer/vulkan/shader/innerproduct.comp index c26d45a17..db4a05ff7 100644 --- a/src/layer/vulkan/shader/innerproduct.comp +++ b/src/layer/vulkan/shader/innerproduct.comp @@ -92,6 +92,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gx] = sfp(sum); } diff --git a/src/layer/vulkan/shader/innerproduct_pack1to4.comp b/src/layer/vulkan/shader/innerproduct_pack1to4.comp index 1b84e6d19..f88c37f4d 100644 --- a/src/layer/vulkan/shader/innerproduct_pack1to4.comp +++ 
b/src/layer/vulkan/shader/innerproduct_pack1to4.comp @@ -96,6 +96,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/innerproduct_pack4.comp b/src/layer/vulkan/shader/innerproduct_pack4.comp index 7468cd689..61cd94320 100644 --- a/src/layer/vulkan/shader/innerproduct_pack4.comp +++ b/src/layer/vulkan/shader/innerproduct_pack4.comp @@ -111,6 +111,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/innerproduct_pack4_lds_64.comp b/src/layer/vulkan/shader/innerproduct_pack4_lds_64.comp index 75c47a387..a52944fee 100644 --- a/src/layer/vulkan/shader/innerproduct_pack4_lds_64.comp +++ b/src/layer/vulkan/shader/innerproduct_pack4_lds_64.comp @@ -64,9 +64,6 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= p.outw || gy >= 1 || gz >= 1) - return; - int lx = int(gl_LocalInvocationID.x); int ly = int(gl_LocalInvocationID.y); int lz = int(gl_LocalInvocationID.z); @@ -90,94 +87,103 @@ void main() barrier(); // load v to local cache - if (lz < 64) + if (lx < 64) { - lv[lz] = afpvec4(bottom_blob_data[i + lz]); + lv[lx] = afpvec4(bottom_blob_data[i + lx]); } barrier(); memoryBarrierShared(); - afpvec4 v = lv[lz]; + for (int j=0; j<64; j++) + { + afpvec4 v = lv[j]; #if NCNN_fp16_storage && !NCNN_fp16_arithmetic - // GL_EXT_shader_16bit_storage does not define f16mat4 type :( - afpmat4 k = afpmat4( - afpvec4(weight_data[(w_offset + i) * 4 + 0]), - afpvec4(weight_data[(w_offset + i) * 4 + 1]), - afpvec4(weight_data[(w_offset + i) * 4 + 2]), - afpvec4(weight_data[(w_offset + i) * 4 + 3]) - ); + // 
GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + afpvec4(weight_data[(w_offset + i + j) * 4 + 0]), + afpvec4(weight_data[(w_offset + i + j) * 4 + 1]), + afpvec4(weight_data[(w_offset + i + j) * 4 + 2]), + afpvec4(weight_data[(w_offset + i + j) * 4 + 3]) + ); #else - afpmat4 k = afpmat4(weight_data[w_offset + i]); + afpmat4 k = afpmat4(weight_data[w_offset + i + j]); #endif - sum += v * k; + sum += v * k; + } } for (; i+15 < p.w; i+=16) { barrier(); // load v to local cache - if (lz < 16) + if (lx < 16) { - lv[lz] = afpvec4(bottom_blob_data[i + lz]); + lv[lx] = afpvec4(bottom_blob_data[i + lx]); } barrier(); memoryBarrierShared(); - afpvec4 v = lv[lz]; + for (int j=0; j<16; j++) + { + afpvec4 v = lv[j]; #if NCNN_fp16_storage && !NCNN_fp16_arithmetic - // GL_EXT_shader_16bit_storage does not define f16mat4 type :( - afpmat4 k = afpmat4( - afpvec4(weight_data[(w_offset + i) * 4 + 0]), - afpvec4(weight_data[(w_offset + i) * 4 + 1]), - afpvec4(weight_data[(w_offset + i) * 4 + 2]), - afpvec4(weight_data[(w_offset + i) * 4 + 3]) - ); + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + afpvec4(weight_data[(w_offset + i + j) * 4 + 0]), + afpvec4(weight_data[(w_offset + i + j) * 4 + 1]), + afpvec4(weight_data[(w_offset + i + j) * 4 + 2]), + afpvec4(weight_data[(w_offset + i + j) * 4 + 3]) + ); #else - afpmat4 k = afpmat4(weight_data[w_offset + i]); + afpmat4 k = afpmat4(weight_data[w_offset + i + j]); #endif - sum += v * k; + sum += v * k; + } } for (; i+3 < p.w; i+=4) { barrier(); // load v to local cache - if (lz < 4) + if (lx < 4) { - lv[lz] = afpvec4(bottom_blob_data[i + lz]); + lv[lx] = afpvec4(bottom_blob_data[i + lx]); } barrier(); memoryBarrierShared(); - afpvec4 v = lv[lz]; + for (int j=0; j<4; j++) + { + afpvec4 v = lv[j]; #if NCNN_fp16_storage && !NCNN_fp16_arithmetic - // GL_EXT_shader_16bit_storage does not define f16mat4 type :( - afpmat4 k = afpmat4( - afpvec4(weight_data[(w_offset + i) 
* 4 + 0]), - afpvec4(weight_data[(w_offset + i) * 4 + 1]), - afpvec4(weight_data[(w_offset + i) * 4 + 2]), - afpvec4(weight_data[(w_offset + i) * 4 + 3]) - ); + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + afpvec4(weight_data[(w_offset + i + j) * 4 + 0]), + afpvec4(weight_data[(w_offset + i + j) * 4 + 1]), + afpvec4(weight_data[(w_offset + i + j) * 4 + 2]), + afpvec4(weight_data[(w_offset + i + j) * 4 + 3]) + ); #else - afpmat4 k = afpmat4(weight_data[w_offset + i]); + afpmat4 k = afpmat4(weight_data[w_offset + i + j]); #endif - sum += v * k; + sum += v * k; + } } for (; i < p.w; i++) { barrier(); // load v to local cache - if (lz == 0) + if (lx == 0) { lv[0] = afpvec4(bottom_blob_data[i]); } @@ -202,6 +208,9 @@ void main() sum += v * k; } + if (gx >= p.outw || gy >= 1 || gz >= 1) + return; + if (activation_type == 1) { sum = max(sum, afp(0.f)); @@ -217,6 +226,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gx] = sfpvec4(sum); } diff --git a/src/layer/vulkan/shader/innerproduct_pack4to1.comp b/src/layer/vulkan/shader/innerproduct_pack4to1.comp index 5d3b1ab3c..fbc970119 100644 --- a/src/layer/vulkan/shader/innerproduct_pack4to1.comp +++ b/src/layer/vulkan/shader/innerproduct_pack4to1.comp @@ -96,6 +96,10 @@ void main() const afp const_max = afp(activation_param_1); sum = clamp(sum, const_min, const_max); } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } top_blob_data[gx] = sfp(sum); } diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index 070a9ee8b..57d9c6e20 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -65,6 +65,13 @@ int Convolution_x86::create_pipeline(const Option& opt) pd.set(1, activation_params[1]);// max activation->load_param(pd); } + else if 
(activation_type == 4) + { + activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); + + ncnn::ParamDict pd; + activation->load_param(pd); + } if (activation) { diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp index ea6026d23..0adaad109 100644 --- a/src/layer/x86/convolutiondepthwise_x86.cpp +++ b/src/layer/x86/convolutiondepthwise_x86.cpp @@ -62,6 +62,13 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) pd.set(1, activation_params[1]);// max activation->load_param(pd); } + else if (activation_type == 4) + { + activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); + + ncnn::ParamDict pd; + activation->load_param(pd); + } if (activation) { diff --git a/src/pipeline.cpp b/src/pipeline.cpp index 2d3a21ece..d666ffa2d 100644 --- a/src/pipeline.cpp +++ b/src/pipeline.cpp @@ -45,7 +45,7 @@ int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const char* { local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size); - fprintf(stderr, "local_shader_module %p %s created\n", local_shader_module, entry_name); +// fprintf(stderr, "local_shader_module %p %s created\n", local_shader_module, entry_name); create_descriptorset_layout(binding_count); diff --git a/tools/ncnnoptimize.cpp b/tools/ncnnoptimize.cpp index 8242607dc..26f763343 100644 --- a/tools/ncnnoptimize.cpp +++ b/tools/ncnnoptimize.cpp @@ -91,8 +91,10 @@ public: int eliminate_dropout(); int eliminate_flatten_after_global_pooling(); + int eliminate_flatten_after_innerproduct(); int replace_convolution_with_innerproduct_after_global_pooling(); + int replace_convolution_with_innerproduct_after_innerproduct(); public: int fprintf_param_int_array(int id, const ncnn::Mat& m, FILE* pp); @@ -662,7 +664,7 @@ int NetOptimize::fuse_convolution_activation() int j = i + 1; for (; jtype != "ReLU" && layers[j]->type != "Clip") + if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid") 
continue; if (layers[j]->bottoms.size() != 1) @@ -705,6 +707,10 @@ int NetOptimize::fuse_convolution_activation() convolution->activation_params[0] = clip->min; convolution->activation_params[1] = clip->max; } + else if (activation->type == "Sigmoid") + { + convolution->activation_type = 4; + } int top_blob_index_final = activation->tops[0]; convolution->tops[0] = top_blob_index_final; @@ -729,7 +735,7 @@ int NetOptimize::fuse_convolutiondepthwise_activation() int j = i + 1; for (; jtype != "ReLU" && layers[j]->type != "Clip") + if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid") continue; if (layers[j]->bottoms.size() != 1) @@ -772,6 +778,10 @@ int NetOptimize::fuse_convolutiondepthwise_activation() convolutiondepthwise->activation_params[0] = clip->min; convolutiondepthwise->activation_params[1] = clip->max; } + else if (activation->type == "Sigmoid") + { + convolutiondepthwise->activation_type = 4; + } int top_blob_index_final = activation->tops[0]; convolutiondepthwise->tops[0] = top_blob_index_final; @@ -796,7 +806,7 @@ int NetOptimize::fuse_deconvolution_activation() int j = i + 1; for (; jtype != "ReLU" && layers[j]->type != "Clip") + if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid") continue; if (layers[j]->bottoms.size() != 1) @@ -839,6 +849,10 @@ int NetOptimize::fuse_deconvolution_activation() deconvolution->activation_params[0] = clip->min; deconvolution->activation_params[1] = clip->max; } + else if (activation->type == "Sigmoid") + { + deconvolution->activation_type = 4; + } int top_blob_index_final = activation->tops[0]; deconvolution->tops[0] = top_blob_index_final; @@ -863,7 +877,7 @@ int NetOptimize::fuse_deconvolutiondepthwise_activation() int j = i + 1; for (; jtype != "ReLU" && layers[j]->type != "Clip") + if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid") continue; if (layers[j]->bottoms.size() != 1) @@ -906,6 +920,10 @@ 
int NetOptimize::fuse_deconvolutiondepthwise_activation() deconvolutiondepthwise->activation_params[0] = clip->min; deconvolutiondepthwise->activation_params[1] = clip->max; } + else if (activation->type == "Sigmoid") + { + deconvolutiondepthwise->activation_type = 4; + } int top_blob_index_final = activation->tops[0]; deconvolutiondepthwise->tops[0] = top_blob_index_final; @@ -930,7 +948,7 @@ int NetOptimize::fuse_innerproduct_activation() int j = i + 1; for (; j<layer_count; j++) { - if (layers[j]->type != "ReLU" && layers[j]->type != "Clip") + if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid") continue; if (layers[j]->bottoms.size() != 1) @@ -973,6 +991,10 @@ int NetOptimize::fuse_innerproduct_activation() innerproduct->activation_params[0] = clip->min; innerproduct->activation_params[1] = clip->max; } + else if (activation->type == "Sigmoid") + { + innerproduct->activation_type = 4; + } int top_blob_index_final = activation->tops[0]; innerproduct->tops[0] = top_blob_index_final; @@ -1071,6 +1093,47 @@ int NetOptimize::eliminate_flatten_after_global_pooling() return 0; } +int NetOptimize::eliminate_flatten_after_innerproduct() +{ + const int layer_count = layers.size(); + for (int i=0; i<layer_count; i++) + { + if (layers[i]->type != "InnerProduct") + continue; + + // InnerProduct - Flatten + int top_blob_index = layers[i]->tops[0]; + + int j = i + 1; + for (; j<layer_count; j++) + { + if (layers[j]->type != "Flatten") + continue; + + if (layers[j]->bottoms.size() != 1) + continue; + + if (layers[j]->bottoms[0] == top_blob_index) + break; + } + + if (j == layer_count) + continue; + + ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i]; + ncnn::Flatten* flatten = (ncnn::Flatten*)layers[j]; + + fprintf(stderr, "eliminate_flatten_after_innerproduct %s %s\n", innerproduct->name.c_str(), flatten->name.c_str()); + + int top_blob_index_final = flatten->tops[0]; + innerproduct->tops[0] = top_blob_index_final; + blobs[top_blob_index_final].producer = i; + flatten->type = "ncnnfused"; + } + + return 0; +} + int
NetOptimize::replace_convolution_with_innerproduct_after_global_pooling() { const int layer_count = layers.size(); @@ -1123,6 +1186,9 @@ int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling() innerproduct->weight_data = convolution->weight_data; innerproduct->bias_data = convolution->bias_data; + innerproduct->activation_type = convolution->activation_type; + innerproduct->activation_params = convolution->activation_params; + layers[j] = innerproduct; delete convolution; } @@ -1130,6 +1196,75 @@ int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling() return 0; } +int NetOptimize::replace_convolution_with_innerproduct_after_innerproduct() +{ + const int layer_count = layers.size(); + for (;;) + { + bool replaced = false; + + for (int i=0; i<layer_count; i++) + { + if (layers[i]->type != "InnerProduct") + continue; + + // InnerProduct - Convolution + int top_blob_index = layers[i]->tops[0]; + + int j = i + 1; + for (; j<layer_count; j++) + { + if (layers[j]->type != "Convolution") + continue; + + if (layers[j]->bottoms.size() != 1) + continue; + + if (layers[j]->bottoms[0] == top_blob_index) + break; + } + + if (j == layer_count) + continue; + + ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i]; + ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j]; + + fprintf(stderr, "replace_convolution_with_innerproduct_after_innerproduct %s %s\n", innerproduct->name.c_str(), convolution->name.c_str()); + + ncnn::InnerProduct* innerproduct2 = (ncnn::InnerProduct*)ncnn::create_layer("InnerProduct"); + + innerproduct2->type = "InnerProduct"; + innerproduct2->name = convolution->name; + innerproduct2->bottoms = convolution->bottoms; + innerproduct2->tops = convolution->tops; + + ncnn::ParamDict pd; + innerproduct2->load_param(pd); + + innerproduct2->num_output = convolution->num_output; + innerproduct2->bias_term = convolution->bias_term; + innerproduct2->weight_data_size = convolution->weight_data_size; + + innerproduct2->weight_data = convolution->weight_data; + innerproduct2->bias_data =
convolution->bias_data; + + innerproduct2->activation_type = convolution->activation_type; + innerproduct2->activation_params = convolution->activation_params; + + layers[j] = innerproduct2; + delete convolution; + + replaced = true; + } + + if (!replaced) + break; + } + + return 0; +} + int NetOptimize::fprintf_param_int_array(int id, const ncnn::Mat& m, FILE* pp) { const int count = m.w; @@ -1158,8 +1293,15 @@ int NetOptimize::fprintf_param_float_array(int id, const ncnn::Mat& m, FILE* pp) return 0; } +static inline size_t alignSize(size_t sz, int n) +{ + return (sz + n-1) & -n; +} + int NetOptimize::fwrite_weight_tag_data(int tag, const ncnn::Mat& data, FILE* bp) { + int p0 = ftell(bp); + ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.c); if (storage_type == 1 && tag == 0) { @@ -1174,13 +1316,29 @@ int NetOptimize::fwrite_weight_tag_data(int tag, const ncnn::Mat& data, FILE* bp fwrite(&tag, sizeof(int), 1, bp); fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); } + + // padding to 32bit align + int nwrite = ftell(bp) - p0; + int nalign = alignSize(nwrite, 4); + unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00}; + fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp); + return 0; } int NetOptimize::fwrite_weight_data(const ncnn::Mat& data, FILE* bp) { + int p0 = ftell(bp); + ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.c); fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); + + // padding to 32bit align + int nwrite = ftell(bp) - p0; + int nalign = alignSize(nwrite, 4); + unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00}; + fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp); + return 0; } @@ -1835,6 +1993,9 @@ int main(int argc, char** argv) optimizer.eliminate_flatten_after_global_pooling(); optimizer.replace_convolution_with_innerproduct_after_global_pooling(); + optimizer.replace_convolution_with_innerproduct_after_innerproduct(); + + 
optimizer.eliminate_flatten_after_innerproduct(); optimizer.save(outparam, outbin);