// NOTE(review): the text of this file appears damaged. The whole translation
// unit sits on six physical lines, so every `//` comment below swallows the
// code that follows it on the same physical line, and several spans that
// presumably contained `<` ... `>` are gone, e.g. `#include` with no header
// name, `for (int g=0; gload_param(pd);`, `for (int g=0; gforward(...)`,
// `std::vector _space_ofs(maxk)` and `for (int g=0; g 0.f ? sum : sum * slope`.
// Restore the file from a clean copy before building. The notes added here
// describe only what is still legible and leave the original text untouched.
//
// The next physical line holds the license header, the includes, the opening
// of namespace ncnn, DEFINE_LAYER_CREATOR, the constructor and the first part
// of create_pipeline().
//   - Constructor: initialises `activation` to 0.
//   - create_pipeline(opt): works on a copy of `opt` (opt_cpu) whose
//     use_vulkan_compute is forced to false. It builds the optional fused
//     activation layer from activation_type:
//       1 -> ReLU with an empty ParamDict,
//       2 -> ReLU with param 0 = activation_params[0] (slope),
//       3 -> Clip with params 0/1 = activation_params[0]/[1] (min/max),
//       4 -> Sigmoid with an empty ParamDict.
//     Any other value leaves `activation` as it was. If `activation` is
//     non-null, create_pipeline(opt_cpu) is called on it.
// Tencent is pleased to support the open source community by making ncnn available. // // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // // Unless required by applicable law or agreed to in writing, software distributed // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. #include "convolutiondepthwise_arm.h" #include "layer_type.h" #if __ARM_NEON #include #include "neon_mathfun.h" #endif // __ARM_NEON namespace ncnn { #include "convolutiondepthwise_3x3.h" #include "convolutiondepthwise_5x5.h" #include "convolutiondepthwise_3x3_int8.h" DEFINE_LAYER_CREATOR(ConvolutionDepthWise_arm) ConvolutionDepthWise_arm::ConvolutionDepthWise_arm() { activation = 0; } int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) { Option opt_cpu = opt; opt_cpu.use_vulkan_compute = false; if (activation_type == 1) { activation = ncnn::create_layer(ncnn::LayerType::ReLU); ncnn::ParamDict pd; activation->load_param(pd); } else if (activation_type == 2) { activation = ncnn::create_layer(ncnn::LayerType::ReLU); ncnn::ParamDict pd; pd.set(0, activation_params[0]);// slope activation->load_param(pd); } else if (activation_type == 3) { activation = ncnn::create_layer(ncnn::LayerType::Clip); ncnn::ParamDict pd; pd.set(0, activation_params[0]);// min pd.set(1, activation_params[1]);// max activation->load_param(pd); } else if (activation_type == 4) { activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); ncnn::ParamDict pd; activation->load_param(pd); } if (activation) { activation->create_pipeline(opt_cpu); } // create Convolution op for each group 
// The next physical line holds the rest of create_pipeline(), all of
// destroy_pipeline() and the start of forward().
//   - create_pipeline (continued): maxk = kernel_w * kernel_h, and `channels`
//     is derived from weight_data_size, group, maxk and num_output.
//     With opt.use_packing_layout:
//       * depth-wise case (channels == group == num_output) with
//         num_output % 4 == 0: weight_data is reshaped to (maxk, group) and
//         passed to convert_packing(..., weight_data_pack4, 4);
//       * when channels_g and num_output_g are both multiples of 4,
//         weight_data_pack4_groups is created for the regrouped weights.
//     NOTE(review): the regrouping loop and everything up to the per-group
//     op's load_param(pd) call is missing from the text.
//     The legible tail loads each group's weights through ModelBinFromMatArray
//     (weight, plus bias when bias_term, plus the two int8 scale ranges when
//     int8_scale_term), calls create_pipeline(opt_cpu) on the op and stores it
//     in group_ops[g]. Returns 0.
//   - destroy_pipeline(opt): using the same opt_cpu copy, calls
//     destroy_pipeline and delete on `activation` (then resets it to 0) and on
//     every entry of group_ops, clears group_ops and returns 0.
//   - forward(bottom_blob, top_blob, opt): starts by reading w, h, c and
//     elemsize from bottom_blob.
const int maxk = kernel_w * kernel_h; int channels = (weight_data_size / group) / maxk / (num_output / group) * group; if (opt.use_packing_layout) { // depth-wise if (channels == group && group == num_output) { // pack4 if (num_output % 4 == 0) { Mat weight_data_r2 = weight_data.reshape(maxk, group); convert_packing(weight_data_r2, weight_data_pack4, 4); } } // group convolution const int channels_g = channels / group; const int num_output_g = num_output / group; // pack4 if (channels_g % 4 == 0 && num_output_g % 4 == 0) { // src = kw-kh-inch-outch // dst = 4a-4b-kw-kh-inch/4a-outch/4b { Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group); weight_data_pack4_groups.create(maxk, channels_g/4, num_output_g/4 * group, (size_t)4*16, 16); for (int g=0; gload_param(pd); // set weights if (bias_term) { ncnn::Mat weights[4]; weights[0] = weight_data_g; weights[1] = bias_data_g; if (int8_scale_term) { weights[2] = weight_data_int8_scales.range(g, 1); weights[3] = bottom_blob_int8_scales.range(g, 1); } op->load_model(ModelBinFromMatArray(weights)); } else { ncnn::Mat weights[3]; weights[0] = weight_data_g; if (int8_scale_term) { weights[1] = weight_data_int8_scales.range(g, 1); weights[2] = bottom_blob_int8_scales.range(g, 1); } op->load_model(ModelBinFromMatArray(weights)); } op->create_pipeline(opt_cpu); group_ops[g] = op; } return 0; } int ConvolutionDepthWise_arm::destroy_pipeline(const Option& opt) { Option opt_cpu = opt; opt_cpu.use_vulkan_compute = false; if (activation) { activation->destroy_pipeline(opt_cpu); delete activation; activation = 0; } for (int i=0; i<(int)group_ops.size(); i++) { group_ops[i]->destroy_pipeline(opt_cpu); delete group_ops[i]; } group_ops.clear(); return 0; } int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // convolv with NxN kernel // value = value + bias int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; size_t elemsize = 
// forward(), input preparation:
//   - `packing` is read from bottom_blob; kernel_extent_{w,h} =
//     dilation * (kernel - 1) + 1.
//   - If use_int8_inference and elemsize != 1, a 1-byte-per-element blob of
//     size w x h x channels is created with opt.workspace_allocator (returns
//     -100 if it is empty) and filled per group. NOTE(review): the loop
//     header and the start of its body are missing from the text.
//   - Padding: if pad_w > 0 or pad_h > 0, copy_make_border is called with
//     (pad_h, pad_h, pad_w, pad_w), BORDER_CONSTANT and value 0.f. If both
//     pads equal -233, wpad/hpad are computed from the kernel extent and
//     stride, and the border is passed as (hpad / 2, hpad - hpad / 2,
//     wpad / 2, wpad - wpad / 2). Either path returns -100 if the bordered
//     blob is empty, then refreshes w and h from it.
//   - outw/outh = (size - kernel_extent) / stride + 1.
bottom_blob.elemsize; int packing = bottom_blob.packing; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; Mat bottom_blob_unbordered = bottom_blob; if (use_int8_inference && elemsize != 1) { Mat bottom_blob_int8; bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator); if (bottom_blob_int8.empty()) return -100; const int channels_g = channels / group; // quantize, scale and round to nearest #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gforward(bottom_blob_g, bottom_blob_int8_g, opt_g); } bottom_blob_unbordered = bottom_blob_int8; } Mat bottom_blob_bordered = bottom_blob_unbordered; if (pad_w > 0 || pad_h > 0) { copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, pad_h, pad_h, pad_w, pad_w, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; w = bottom_blob_bordered.w; h = bottom_blob_bordered.h; } else if (pad_w == -233 && pad_h == -233) { int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; if (wpad > 0 || hpad > 0) { copy_make_border(bottom_blob_unbordered, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f, opt.workspace_allocator, opt.num_threads); if (bottom_blob_bordered.empty()) return -100; } w = bottom_blob_bordered.w; h = bottom_blob_bordered.h; } int outw = (w - kernel_extent_w) / stride_w + 1; int outh = (h - kernel_extent_h) / stride_h + 1; int out_packing = num_output % 4 == 0 ? 
// forward(), packing-layout path and start of the int8 path:
//   - out_packing is 4 when num_output % 4 == 0, otherwise 1;
//     out_elemsize = elemsize / packing * out_packing.
//   - With opt.use_packing_layout: space_ofs[] receives one offset per kernel
//     tap, stepping by dilation_w within a kernel row and adding
//     `gap = w * dilation_h - kernel_w * dilation_w` after each row. top_blob
//     is created with num_output / out_packing channels (returns -100 if
//     empty). NOTE(review): the depth-wise pack4 loop and most of the group
//     code are missing from the text. The legible tail applies the activation
//     inline (type 2: slope on non-positive sums, type 3: clamp to
//     [activation_params[0], activation_params[1]], type 4:
//     1 / (1 + exp(-sum))), then either runs convert_packing(..., 4) on
//     top_blob_unpacked or assigns it to top_blob, and returns 0.
//   - With use_int8_inference and use_int8_requantize: a 4-byte top_blob_tm
//     (workspace allocator) and a 1-byte top_blob (blob allocator) are
//     created, each returning -100 if empty. Depth-wise 3x3, dilation 1,
//     stride 1x1 or 2x2 goes to convdw3x3s{1,2}_int8_requant_neon.
4 : 1; size_t out_elemsize = elemsize / packing * out_packing; if (opt.use_packing_layout) { const int maxk = kernel_w * kernel_h; // kernel offsets std::vector _space_ofs(maxk); int* space_ofs = &_space_ofs[0]; { int p1 = 0; int p2 = 0; int gap = w * dilation_h - kernel_w * dilation_w; for (int i = 0; i < kernel_h; i++) { for (int j = 0; j < kernel_w; j++) { space_ofs[p1] = p2; p1++; p2 += dilation_w; } p2 += gap; } } top_blob.create(outw, outh, num_output / out_packing, out_elemsize, out_packing, opt.blob_allocator); if (top_blob.empty()) return -100; // depth-wise if (channels == group / packing && group / packing == num_output / packing) { if (packing == 4) { #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; g 0.f ? sum : sum * slope; } else if (activation_type == 3) { float min = activation_params[0]; float max = activation_params[1]; if (sum < min) sum = min; if (sum > max) sum = max; } else if (activation_type == 4) { sum = 1.f / (1.f + exp(-sum)); } outptr[j] = sum; } outptr += outw; } } } } // packing if (num_output_g % 4 != 0 && out_packing == 4) { convert_packing(top_blob_unpacked, top_blob, 4, opt.blob_allocator, opt.num_threads); } else { top_blob = top_blob_unpacked; } return 0; } // opt.use_packing_layout // int8 if (use_int8_inference) { if (use_int8_requantize) { Mat top_blob_tm; top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); if (top_blob_tm.empty()) return -100; top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator); if (top_blob.empty()) return -100; // depth-wise if (channels == group && group == num_output) { if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) { if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) { if (stride_w == 1 && stride_h == 1) { convdw3x3s1_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt); } else if (stride_w == 2 && stride_h == 2) { 
// forward(), rest of the int8 path:
//   - After a specialised requantizing kernel the activation (if any) is
//     applied in place and 0 is returned. Otherwise the per-group ops run.
//     NOTE(review): those loop headers are missing from the text.
//   - Without use_int8_requantize: top_blob is created with 4-byte elements
//     (returns -100 if empty). Depth-wise 3x3, dilation 1, stride 1x1 or 2x2
//     goes to convdw3x3s{1,2}_int8_neon, followed by a per-group in-place
//     step labelled "dequantize, reverse scale inplace" and then the
//     activation. Other shapes fall back to the per-group ops.
//   - The float32 top_blob is created at the end of the next physical line
//     (returns -100 if empty).
convdw3x3s2_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt); } if (activation) { activation->forward_inplace(top_blob, opt); } return 0; } } #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gforward(bottom_blob_bordered_g, top_blob_tm_g, opt_g); } if (activation) { activation->forward_inplace(top_blob, opt); } return 0; } const int channels_g = channels / group; const int num_output_g = num_output / group; #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gforward(bottom_blob_bordered_g, top_blob_tm_g, opt_g); } } else { top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator); if (top_blob.empty()) return -100; // depth-wise if (channels == group && group == num_output) { if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) { if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) { if (stride_w == 1 && stride_h == 1) { convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt); } else if (stride_w == 2 && stride_h == 2) { convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt); } // dequantize, reverse scale inplace #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gforward_inplace(top_blob_g, opt_g); } if (activation) { activation->forward_inplace(top_blob, opt); } return 0; } } #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gforward(bottom_blob_bordered_g, top_blob_g, opt_g); } if (activation) { activation->forward_inplace(top_blob, opt); } return 0; } const int channels_g = channels / group; const int num_output_g = num_output / group; #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gforward(bottom_blob_bordered_g, top_blob_g, opt_g); } } if (activation) { activation->forward_inplace(top_blob, opt); } return 0; } // float32 top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator); if (top_blob.empty()) return -100; // 
// forward(), float32 path:
//   - Depth-wise (channels == group == num_output) with dilation 1: 3x3 goes
//     to convdw3x3s{1,2}_neon and 5x5 to convdw5x5s{1,2}_neon for stride 1x1
//     or 2x2, followed by the activation and `return 0`.
//     NOTE(review): unlike the int8 branches, these 3x3 and 5x5 branches do
//     not check that the stride is 1x1 or 2x2 before committing. For any
//     other stride no kernel is called, yet the activation runs and 0 is
//     returned on a top_blob that was never written. Such strides should fall
//     through to the per-group ops.
//   - All other depth-wise shapes and the general group case run the
//     per-group ops into channel ranges of top_blob, then the activation.
//     NOTE(review): the last group loop has no `#pragma omp parallel for` in
//     the visible text, unlike its int8 counterparts. Confirm whether that is
//     intended.
depth-wise if (channels == group && group == num_output) { if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1) { if (stride_w == 1 && stride_h == 1) { convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); } else if (stride_w == 2 && stride_h == 2) { convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); } if (activation) { activation->forward_inplace(top_blob, opt); } return 0; } if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1) { if (stride_w == 1 && stride_h == 1) { convdw5x5s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); } else if (stride_w == 2 && stride_h == 2) { convdw5x5s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); } if (activation) { activation->forward_inplace(top_blob, opt); } return 0; } #pragma omp parallel for num_threads(opt.num_threads) for (int g=0; gforward(bottom_blob_bordered_g, top_blob_g, opt_g); } if (activation) { activation->forward_inplace(top_blob, opt); } return 0; } const int channels_g = channels / group; const int num_output_g = num_output / group; for (int g=0; gforward(bottom_blob_bordered_g, top_blob_g, opt_g); } if (activation) { activation->forward_inplace(top_blob, opt); } return 0; } } // namespace ncnn