| @@ -64,6 +64,13 @@ int Convolution_arm::create_pipeline(const Option& opt) | |||
| pd.set(1, activation_params[1]);// max | |||
| activation->load_param(pd); | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); | |||
| ncnn::ParamDict pd; | |||
| activation->load_param(pd); | |||
| } | |||
| if (activation) | |||
| { | |||
| @@ -62,6 +62,13 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) | |||
| pd.set(1, activation_params[1]);// max | |||
| activation->load_param(pd); | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); | |||
| ncnn::ParamDict pd; | |||
| activation->load_param(pd); | |||
| } | |||
| if (activation) | |||
| { | |||
| @@ -53,6 +53,13 @@ int Deconvolution_arm::create_pipeline(const Option& opt) | |||
| pd.set(1, activation_params[1]);// max | |||
| activation->load_param(pd); | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); | |||
| ncnn::ParamDict pd; | |||
| activation->load_param(pd); | |||
| } | |||
| if (activation) | |||
| { | |||
| @@ -50,6 +50,13 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt) | |||
| pd.set(1, activation_params[1]);// max | |||
| activation->load_param(pd); | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); | |||
| ncnn::ParamDict pd; | |||
| activation->load_param(pd); | |||
| } | |||
| if (activation) | |||
| { | |||
| @@ -299,6 +299,10 @@ int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Optio | |||
| if (sum > max) | |||
| sum = max; | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| sum = 1.f / (1.f + exp(-sum)); | |||
| } | |||
| top_blob[p] = sum; | |||
| } | |||
| @@ -551,6 +551,10 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op | |||
| if (sum > max) | |||
| sum = max; | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| sum = 1.f / (1.f + exp(-sum)); | |||
| } | |||
| outptr[j] = sum; | |||
| } | |||
| @@ -52,7 +52,7 @@ public: | |||
| int int8_scale_term; | |||
| // 0=none 1=relu 2=leakyrelu 3=clip | |||
| // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid | |||
| int activation_type; | |||
| Mat activation_params; | |||
| @@ -655,6 +655,10 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O | |||
| if (sum > max) | |||
| sum = max; | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| sum = 1.f / (1.f + exp(-sum)); | |||
| } | |||
| outptr[j] = sum; | |||
| } | |||
| @@ -726,6 +730,10 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O | |||
| if (sum > max) | |||
| sum = max; | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| sum = 1.f / (1.f + exp(-sum)); | |||
| } | |||
| outptr[j] = sum; | |||
| } | |||
| @@ -53,7 +53,7 @@ public: | |||
| int int8_scale_term; | |||
| // 0=none 1=relu 2=leakyrelu 3=clip | |||
| // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid | |||
| int activation_type; | |||
| Mat activation_params; | |||
| @@ -186,6 +186,16 @@ int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& | |||
| outptr[i] = max; | |||
| } | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| float* outptr = out; | |||
| int size = outw * outh; | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| outptr[i] = 1.f / (1.f + exp(-outptr[i])); | |||
| } | |||
| } | |||
| } | |||
| if (pad_w > 0 || pad_h > 0) | |||
| @@ -45,7 +45,7 @@ public: | |||
| int weight_data_size; | |||
| // 0=none 1=relu 2=leakyrelu 3=clip | |||
| // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid | |||
| int activation_type; | |||
| Mat activation_params; | |||
| @@ -186,6 +186,16 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const | |||
| outptr[i] = max; | |||
| } | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| float* outptr = m; | |||
| int size = outw * outh; | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| outptr[i] = 1.f / (1.f + exp(-outptr[i])); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| else | |||
| @@ -270,6 +280,16 @@ int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const | |||
| outptr[i] = max; | |||
| } | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| float* outptr = out; | |||
| int size = outw * outh; | |||
| for (int i = 0; i < size; i++) | |||
| { | |||
| outptr[i] = 1.f / (1.f + exp(-outptr[i])); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -46,7 +46,7 @@ public: | |||
| int weight_data_size; | |||
| int group; | |||
| // 0=none 1=relu 2=leakyrelu 3=clip | |||
| // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid | |||
| int activation_type; | |||
| Mat activation_params; | |||
| @@ -289,6 +289,10 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o | |||
| if (sum > max) | |||
| sum = max; | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| sum = 1.f / (1.f + exp(-sum)); | |||
| } | |||
| top_blob[p] = sum; | |||
| } | |||
| @@ -42,7 +42,7 @@ public: | |||
| int int8_scale_term; | |||
| // 0=none 1=relu 2=leakyrelu 3=clip | |||
| // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid | |||
| int activation_type; | |||
| Mat activation_params; | |||
| @@ -109,6 +109,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); | |||
| } | |||
| @@ -94,6 +94,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep_4 + gx] = sfpvec4(sum); | |||
| } | |||
| @@ -28,6 +28,9 @@ layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| @@ -95,5 +98,25 @@ void main() | |||
| } | |||
| } | |||
| if (activation_type == 1) | |||
| { | |||
| sum = max(sum, afp(0.f)); | |||
| } | |||
| if (activation_type == 2) | |||
| { | |||
| const afp slope = afp(activation_param_0); | |||
| sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); | |||
| } | |||
| if (activation_type == 3) | |||
| { | |||
| const afp const_min = afp(activation_param_0); | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); | |||
| } | |||
| @@ -128,6 +128,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); | |||
| } | |||
| @@ -88,20 +88,25 @@ void main() | |||
| int v_offset = gy * p.w + gx; | |||
| int w_offset = gz * p.c * 3 * 3; | |||
| int lv_offset = ly * (/*gl_WorkGroupSize.x*/8 + 2) + lx; | |||
| int lw_offset = lz * 3 * 3; | |||
| for (int z = 0; z < p.c; z++) | |||
| { | |||
| int lv_offset = ly * int(/*gl_WorkGroupSize.x*/8 + 2) + lx; | |||
| int lw_offset = lz * 3 * 3; | |||
| barrier(); | |||
| // load v to local cache | |||
| if (lz == 0) | |||
| { | |||
| int v_offset_1 = v_offset + p.w + 1; | |||
| int lv_offset_1 = lv_offset + int(/*gl_WorkGroupSize.x*/8 + 2) + 1; | |||
| int lv_offset_1 = lv_offset + (/*gl_WorkGroupSize.x*/8 + 2) + 1; | |||
| lv[lv_offset_1] = afpvec4(bottom_blob_data[v_offset_1]); | |||
| } | |||
| else | |||
| { | |||
| int v_offset_1 = v_offset + p.w + 1; | |||
| int lv_offset_1 = lv_offset + (/*gl_WorkGroupSize.x*/8 + 2) + 1; | |||
| // left and right border | |||
| if (lx == 0) | |||
| @@ -170,20 +175,17 @@ void main() | |||
| { | |||
| for (int x = 0; x < 3; x++) | |||
| { | |||
| afpvec4 v = lv[lv_offset + x]; | |||
| afpvec4 v = lv[lv_offset + y * (/*gl_WorkGroupSize.x*/8 + 2) + x]; | |||
| afpmat4 k = afpmat4( | |||
| lk0[lw_offset + x], | |||
| lk1[lw_offset + x], | |||
| lk2[lw_offset + x], | |||
| lk3[lw_offset + x] | |||
| lk0[lw_offset + y * 3 + x], | |||
| lk1[lw_offset + y * 3 + x], | |||
| lk2[lw_offset + y * 3 + x], | |||
| lk3[lw_offset + y * 3 + x] | |||
| ); | |||
| sum += v * k; | |||
| } | |||
| lv_offset += int(/*gl_WorkGroupSize.x*/8 + 2); | |||
| lw_offset += 3; | |||
| } | |||
| v_offset += p.cstep; | |||
| @@ -208,6 +210,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); | |||
| } | |||
| @@ -113,6 +113,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); | |||
| } | |||
| @@ -107,6 +107,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); | |||
| } | |||
| @@ -120,6 +120,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); | |||
| } | |||
| @@ -124,6 +124,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); | |||
| } | |||
| @@ -139,6 +139,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); | |||
| } | |||
| @@ -124,6 +124,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); | |||
| } | |||
| @@ -111,6 +111,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); | |||
| } | |||
| @@ -129,6 +129,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); | |||
| } | |||
| @@ -133,6 +133,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); | |||
| } | |||
| @@ -148,6 +148,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); | |||
| } | |||
| @@ -133,6 +133,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); | |||
| } | |||
| @@ -126,6 +126,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); | |||
| } | |||
| @@ -138,6 +138,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); | |||
| } | |||
| @@ -142,6 +142,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); | |||
| } | |||
| @@ -157,6 +157,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); | |||
| } | |||
| @@ -142,6 +142,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(sum); | |||
| } | |||
| @@ -130,6 +130,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(sum); | |||
| } | |||
| @@ -92,6 +92,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gx] = sfp(sum); | |||
| } | |||
| @@ -96,6 +96,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gx] = sfpvec4(sum); | |||
| } | |||
| @@ -111,6 +111,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gx] = sfpvec4(sum); | |||
| } | |||
| @@ -64,9 +64,6 @@ void main() | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= p.outw || gy >= 1 || gz >= 1) | |||
| return; | |||
| int lx = int(gl_LocalInvocationID.x); | |||
| int ly = int(gl_LocalInvocationID.y); | |||
| int lz = int(gl_LocalInvocationID.z); | |||
| @@ -90,94 +87,103 @@ void main() | |||
| barrier(); | |||
| // load v to local cache | |||
| if (lz < 64) | |||
| if (lx < 64) | |||
| { | |||
| lv[lz] = afpvec4(bottom_blob_data[i + lz]); | |||
| lv[lx] = afpvec4(bottom_blob_data[i + lx]); | |||
| } | |||
| barrier(); | |||
| memoryBarrierShared(); | |||
| afpvec4 v = lv[lz]; | |||
| for (int j=0; j<64; j++) | |||
| { | |||
| afpvec4 v = lv[j]; | |||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||
| afpmat4 k = afpmat4( | |||
| afpvec4(weight_data[(w_offset + i) * 4 + 0]), | |||
| afpvec4(weight_data[(w_offset + i) * 4 + 1]), | |||
| afpvec4(weight_data[(w_offset + i) * 4 + 2]), | |||
| afpvec4(weight_data[(w_offset + i) * 4 + 3]) | |||
| ); | |||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||
| afpmat4 k = afpmat4( | |||
| afpvec4(weight_data[(w_offset + i + j) * 4 + 0]), | |||
| afpvec4(weight_data[(w_offset + i + j) * 4 + 1]), | |||
| afpvec4(weight_data[(w_offset + i + j) * 4 + 2]), | |||
| afpvec4(weight_data[(w_offset + i + j) * 4 + 3]) | |||
| ); | |||
| #else | |||
| afpmat4 k = afpmat4(weight_data[w_offset + i]); | |||
| afpmat4 k = afpmat4(weight_data[w_offset + i + j]); | |||
| #endif | |||
| sum += v * k; | |||
| sum += v * k; | |||
| } | |||
| } | |||
| for (; i+15 < p.w; i+=16) | |||
| { | |||
| barrier(); | |||
| // load v to local cache | |||
| if (lz < 16) | |||
| if (lx < 16) | |||
| { | |||
| lv[lz] = afpvec4(bottom_blob_data[i + lz]); | |||
| lv[lx] = afpvec4(bottom_blob_data[i + lx]); | |||
| } | |||
| barrier(); | |||
| memoryBarrierShared(); | |||
| afpvec4 v = lv[lz]; | |||
| for (int j=0; j<16; j++) | |||
| { | |||
| afpvec4 v = lv[j]; | |||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||
| afpmat4 k = afpmat4( | |||
| afpvec4(weight_data[(w_offset + i) * 4 + 0]), | |||
| afpvec4(weight_data[(w_offset + i) * 4 + 1]), | |||
| afpvec4(weight_data[(w_offset + i) * 4 + 2]), | |||
| afpvec4(weight_data[(w_offset + i) * 4 + 3]) | |||
| ); | |||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||
| afpmat4 k = afpmat4( | |||
| afpvec4(weight_data[(w_offset + i + j) * 4 + 0]), | |||
| afpvec4(weight_data[(w_offset + i + j) * 4 + 1]), | |||
| afpvec4(weight_data[(w_offset + i + j) * 4 + 2]), | |||
| afpvec4(weight_data[(w_offset + i + j) * 4 + 3]) | |||
| ); | |||
| #else | |||
| afpmat4 k = afpmat4(weight_data[w_offset + i]); | |||
| afpmat4 k = afpmat4(weight_data[w_offset + i + j]); | |||
| #endif | |||
| sum += v * k; | |||
| sum += v * k; | |||
| } | |||
| } | |||
| for (; i+3 < p.w; i+=4) | |||
| { | |||
| barrier(); | |||
| // load v to local cache | |||
| if (lz < 4) | |||
| if (lx < 4) | |||
| { | |||
| lv[lz] = afpvec4(bottom_blob_data[i + lz]); | |||
| lv[lx] = afpvec4(bottom_blob_data[i + lx]); | |||
| } | |||
| barrier(); | |||
| memoryBarrierShared(); | |||
| afpvec4 v = lv[lz]; | |||
| for (int j=0; j<4; j++) | |||
| { | |||
| afpvec4 v = lv[j]; | |||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||
| afpmat4 k = afpmat4( | |||
| afpvec4(weight_data[(w_offset + i) * 4 + 0]), | |||
| afpvec4(weight_data[(w_offset + i) * 4 + 1]), | |||
| afpvec4(weight_data[(w_offset + i) * 4 + 2]), | |||
| afpvec4(weight_data[(w_offset + i) * 4 + 3]) | |||
| ); | |||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||
| afpmat4 k = afpmat4( | |||
| afpvec4(weight_data[(w_offset + i + j) * 4 + 0]), | |||
| afpvec4(weight_data[(w_offset + i + j) * 4 + 1]), | |||
| afpvec4(weight_data[(w_offset + i + j) * 4 + 2]), | |||
| afpvec4(weight_data[(w_offset + i + j) * 4 + 3]) | |||
| ); | |||
| #else | |||
| afpmat4 k = afpmat4(weight_data[w_offset + i]); | |||
| afpmat4 k = afpmat4(weight_data[w_offset + i + j]); | |||
| #endif | |||
| sum += v * k; | |||
| sum += v * k; | |||
| } | |||
| } | |||
| for (; i < p.w; i++) | |||
| { | |||
| barrier(); | |||
| // load v to local cache | |||
| if (lz == 0) | |||
| if (lx == 0) | |||
| { | |||
| lv[0] = afpvec4(bottom_blob_data[i]); | |||
| } | |||
| @@ -202,6 +208,9 @@ void main() | |||
| sum += v * k; | |||
| } | |||
| if (gx >= p.outw || gy >= 1 || gz >= 1) | |||
| return; | |||
| if (activation_type == 1) | |||
| { | |||
| sum = max(sum, afp(0.f)); | |||
| @@ -217,6 +226,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gx] = sfpvec4(sum); | |||
| } | |||
| @@ -96,6 +96,10 @@ void main() | |||
| const afp const_max = afp(activation_param_1); | |||
| sum = clamp(sum, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| sum = afp(1.f) / (afp(1.f) + exp(-sum)); | |||
| } | |||
| top_blob_data[gx] = sfp(sum); | |||
| } | |||
| @@ -65,6 +65,13 @@ int Convolution_x86::create_pipeline(const Option& opt) | |||
| pd.set(1, activation_params[1]);// max | |||
| activation->load_param(pd); | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); | |||
| ncnn::ParamDict pd; | |||
| activation->load_param(pd); | |||
| } | |||
| if (activation) | |||
| { | |||
| @@ -62,6 +62,13 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) | |||
| pd.set(1, activation_params[1]);// max | |||
| activation->load_param(pd); | |||
| } | |||
| else if (activation_type == 4) | |||
| { | |||
| activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); | |||
| ncnn::ParamDict pd; | |||
| activation->load_param(pd); | |||
| } | |||
| if (activation) | |||
| { | |||
| @@ -45,7 +45,7 @@ int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const char* | |||
| { | |||
| local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size); | |||
| fprintf(stderr, "local_shader_module %p %s created\n", local_shader_module, entry_name); | |||
| // fprintf(stderr, "local_shader_module %p %s created\n", local_shader_module, entry_name); | |||
| create_descriptorset_layout(binding_count); | |||
| @@ -91,8 +91,10 @@ public: | |||
| int eliminate_dropout(); | |||
| int eliminate_flatten_after_global_pooling(); | |||
| int eliminate_flatten_after_innerproduct(); | |||
| int replace_convolution_with_innerproduct_after_global_pooling(); | |||
| int replace_convolution_with_innerproduct_after_innerproduct(); | |||
| public: | |||
| int fprintf_param_int_array(int id, const ncnn::Mat& m, FILE* pp); | |||
| @@ -662,7 +664,7 @@ int NetOptimize::fuse_convolution_activation() | |||
| int j = i + 1; | |||
| for (; j<layer_count; j++) | |||
| { | |||
| if (layers[j]->type != "ReLU" && layers[j]->type != "Clip") | |||
| if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid") | |||
| continue; | |||
| if (layers[j]->bottoms.size() != 1) | |||
| @@ -705,6 +707,10 @@ int NetOptimize::fuse_convolution_activation() | |||
| convolution->activation_params[0] = clip->min; | |||
| convolution->activation_params[1] = clip->max; | |||
| } | |||
| else if (activation->type == "Sigmoid") | |||
| { | |||
| convolution->activation_type = 4; | |||
| } | |||
| int top_blob_index_final = activation->tops[0]; | |||
| convolution->tops[0] = top_blob_index_final; | |||
| @@ -729,7 +735,7 @@ int NetOptimize::fuse_convolutiondepthwise_activation() | |||
| int j = i + 1; | |||
| for (; j<layer_count; j++) | |||
| { | |||
| if (layers[j]->type != "ReLU" && layers[j]->type != "Clip") | |||
| if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid") | |||
| continue; | |||
| if (layers[j]->bottoms.size() != 1) | |||
| @@ -772,6 +778,10 @@ int NetOptimize::fuse_convolutiondepthwise_activation() | |||
| convolutiondepthwise->activation_params[0] = clip->min; | |||
| convolutiondepthwise->activation_params[1] = clip->max; | |||
| } | |||
| else if (activation->type == "Sigmoid") | |||
| { | |||
| convolutiondepthwise->activation_type = 4; | |||
| } | |||
| int top_blob_index_final = activation->tops[0]; | |||
| convolutiondepthwise->tops[0] = top_blob_index_final; | |||
| @@ -796,7 +806,7 @@ int NetOptimize::fuse_deconvolution_activation() | |||
| int j = i + 1; | |||
| for (; j<layer_count; j++) | |||
| { | |||
| if (layers[j]->type != "ReLU" && layers[j]->type != "Clip") | |||
| if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid") | |||
| continue; | |||
| if (layers[j]->bottoms.size() != 1) | |||
| @@ -839,6 +849,10 @@ int NetOptimize::fuse_deconvolution_activation() | |||
| deconvolution->activation_params[0] = clip->min; | |||
| deconvolution->activation_params[1] = clip->max; | |||
| } | |||
| else if (activation->type == "Sigmoid") | |||
| { | |||
| deconvolution->activation_type = 4; | |||
| } | |||
| int top_blob_index_final = activation->tops[0]; | |||
| deconvolution->tops[0] = top_blob_index_final; | |||
| @@ -863,7 +877,7 @@ int NetOptimize::fuse_deconvolutiondepthwise_activation() | |||
| int j = i + 1; | |||
| for (; j<layer_count; j++) | |||
| { | |||
| if (layers[j]->type != "ReLU" && layers[j]->type != "Clip") | |||
| if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid") | |||
| continue; | |||
| if (layers[j]->bottoms.size() != 1) | |||
| @@ -906,6 +920,10 @@ int NetOptimize::fuse_deconvolutiondepthwise_activation() | |||
| deconvolutiondepthwise->activation_params[0] = clip->min; | |||
| deconvolutiondepthwise->activation_params[1] = clip->max; | |||
| } | |||
| else if (activation->type == "Sigmoid") | |||
| { | |||
| deconvolutiondepthwise->activation_type = 4; | |||
| } | |||
| int top_blob_index_final = activation->tops[0]; | |||
| deconvolutiondepthwise->tops[0] = top_blob_index_final; | |||
| @@ -930,7 +948,7 @@ int NetOptimize::fuse_innerproduct_activation() | |||
| int j = i + 1; | |||
| for (; j<layer_count; j++) | |||
| { | |||
| if (layers[j]->type != "ReLU" && layers[j]->type != "Clip") | |||
| if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid") | |||
| continue; | |||
| if (layers[j]->bottoms.size() != 1) | |||
| @@ -973,6 +991,10 @@ int NetOptimize::fuse_innerproduct_activation() | |||
| innerproduct->activation_params[0] = clip->min; | |||
| innerproduct->activation_params[1] = clip->max; | |||
| } | |||
| else if (activation->type == "Sigmoid") | |||
| { | |||
| innerproduct->activation_type = 4; | |||
| } | |||
| int top_blob_index_final = activation->tops[0]; | |||
| innerproduct->tops[0] = top_blob_index_final; | |||
| @@ -1071,6 +1093,47 @@ int NetOptimize::eliminate_flatten_after_global_pooling() | |||
| return 0; | |||
| } | |||
| int NetOptimize::eliminate_flatten_after_innerproduct() | |||
| { | |||
| const int layer_count = layers.size(); | |||
| for (int i=0; i<layer_count; i++) | |||
| { | |||
| if (layers[i]->type != "InnerProduct") | |||
| continue; | |||
| // InnerProduct - Flatten | |||
| int top_blob_index = layers[i]->tops[0]; | |||
| int j = i + 1; | |||
| for (; j<layer_count; j++) | |||
| { | |||
| if (layers[j]->type != "Flatten") | |||
| continue; | |||
| if (layers[j]->bottoms.size() != 1) | |||
| continue; | |||
| if (layers[j]->bottoms[0] == top_blob_index) | |||
| break; | |||
| } | |||
| if (j == layer_count) | |||
| continue; | |||
| ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i]; | |||
| ncnn::Flatten* flatten = (ncnn::Flatten*)layers[j]; | |||
| fprintf(stderr, "eliminate_flatten_after_innerproduct %s %s\n", innerproduct->name.c_str(), flatten->name.c_str()); | |||
| int top_blob_index_final = flatten->tops[0]; | |||
| innerproduct->tops[0] = top_blob_index_final; | |||
| blobs[top_blob_index_final].producer = i; | |||
| flatten->type = "ncnnfused"; | |||
| } | |||
| return 0; | |||
| } | |||
| int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling() | |||
| { | |||
| const int layer_count = layers.size(); | |||
| @@ -1123,6 +1186,9 @@ int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling() | |||
| innerproduct->weight_data = convolution->weight_data; | |||
| innerproduct->bias_data = convolution->bias_data; | |||
| innerproduct->activation_type = convolution->activation_type; | |||
| innerproduct->activation_params = convolution->activation_params; | |||
| layers[j] = innerproduct; | |||
| delete convolution; | |||
| } | |||
| @@ -1130,6 +1196,75 @@ int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling() | |||
| return 0; | |||
| } | |||
| int NetOptimize::replace_convolution_with_innerproduct_after_innerproduct() | |||
| { | |||
| const int layer_count = layers.size(); | |||
| for (;;) | |||
| { | |||
| bool replaced = false; | |||
| for (int i=0; i<layer_count; i++) | |||
| { | |||
| if (layers[i]->type != "InnerProduct") | |||
| continue; | |||
| // InnerProduct - Convolution | |||
| int top_blob_index = layers[i]->tops[0]; | |||
| int j = i + 1; | |||
| for (; j<layer_count; j++) | |||
| { | |||
| if (layers[j]->type != "Convolution") | |||
| continue; | |||
| if (layers[j]->bottoms.size() != 1) | |||
| continue; | |||
| if (layers[j]->bottoms[0] == top_blob_index) | |||
| break; | |||
| } | |||
| if (j == layer_count) | |||
| continue; | |||
| ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i]; | |||
| ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j]; | |||
| fprintf(stderr, "replace_convolution_with_innerproduct_after_innerproduct %s %s\n", innerproduct->name.c_str(), convolution->name.c_str()); | |||
| ncnn::InnerProduct* innerproduct2 = (ncnn::InnerProduct*)ncnn::create_layer("InnerProduct"); | |||
| innerproduct2->type = "InnerProduct"; | |||
| innerproduct2->name = convolution->name; | |||
| innerproduct2->bottoms = convolution->bottoms; | |||
| innerproduct2->tops = convolution->tops; | |||
| ncnn::ParamDict pd; | |||
| innerproduct2->load_param(pd); | |||
| innerproduct2->num_output = convolution->num_output; | |||
| innerproduct2->bias_term = convolution->bias_term; | |||
| innerproduct2->weight_data_size = convolution->weight_data_size; | |||
| innerproduct2->weight_data = convolution->weight_data; | |||
| innerproduct2->bias_data = convolution->bias_data; | |||
| innerproduct2->activation_type = convolution->activation_type; | |||
| innerproduct2->activation_params = convolution->activation_params; | |||
| layers[j] = innerproduct2; | |||
| delete convolution; | |||
| replaced = true; | |||
| } | |||
| if (!replaced) | |||
| break; | |||
| } | |||
| return 0; | |||
| } | |||
| int NetOptimize::fprintf_param_int_array(int id, const ncnn::Mat& m, FILE* pp) | |||
| { | |||
| const int count = m.w; | |||
| @@ -1158,8 +1293,15 @@ int NetOptimize::fprintf_param_float_array(int id, const ncnn::Mat& m, FILE* pp) | |||
| return 0; | |||
| } | |||
// Round sz up to the next multiple of n.
// The bit-mask arithmetic only yields a true multiple when n is a power of two.
static inline size_t alignSize(size_t sz, int n)
{
    const size_t low_bits = static_cast<size_t>(n) - 1;
    return (sz + low_bits) & ~low_bits;
}
| int NetOptimize::fwrite_weight_tag_data(int tag, const ncnn::Mat& data, FILE* bp) | |||
| { | |||
| int p0 = ftell(bp); | |||
| ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.c); | |||
| if (storage_type == 1 && tag == 0) | |||
| { | |||
| @@ -1174,13 +1316,29 @@ int NetOptimize::fwrite_weight_tag_data(int tag, const ncnn::Mat& data, FILE* bp | |||
| fwrite(&tag, sizeof(int), 1, bp); | |||
| fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); | |||
| } | |||
| // padding to 32bit align | |||
| int nwrite = ftell(bp) - p0; | |||
| int nalign = alignSize(nwrite, 4); | |||
| unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00}; | |||
| fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp); | |||
| return 0; | |||
| } | |||
| int NetOptimize::fwrite_weight_data(const ncnn::Mat& data, FILE* bp) | |||
| { | |||
| int p0 = ftell(bp); | |||
| ncnn::Mat data_flattened = data.reshape(data.w * data.h * data.c); | |||
| fwrite(data_flattened.data, data_flattened.elemsize, data_flattened.w, bp); | |||
| // padding to 32bit align | |||
| int nwrite = ftell(bp) - p0; | |||
| int nalign = alignSize(nwrite, 4); | |||
| unsigned char padding[4] = {0x00, 0x00, 0x00, 0x00}; | |||
| fwrite(padding, sizeof(unsigned char), nalign - nwrite, bp); | |||
| return 0; | |||
| } | |||
| @@ -1835,6 +1993,9 @@ int main(int argc, char** argv) | |||
| optimizer.eliminate_flatten_after_global_pooling(); | |||
| optimizer.replace_convolution_with_innerproduct_after_global_pooling(); | |||
| optimizer.replace_convolution_with_innerproduct_after_innerproduct(); | |||
| optimizer.eliminate_flatten_after_innerproduct(); | |||
| optimizer.save(outparam, outbin); | |||