From 43c4b57201810c9c8aef5055f07498bb5cc1f735 Mon Sep 17 00:00:00 2001 From: nihuini Date: Fri, 22 Feb 2019 19:46:00 +0800 Subject: [PATCH] group deconvolution packing family --- src/layer/convolutiondepthwise.cpp | 11 +- src/layer/deconvolutiondepthwise.cpp | 479 ++++++++++++++---- src/layer/deconvolutiondepthwise.h | 16 +- .../shader/deconvolutiondepthwise_group.comp | 117 +++++ ...deconvolutiondepthwise_group_pack1to4.comp | 121 +++++ .../deconvolutiondepthwise_group_pack4.comp | 121 +++++ ...deconvolutiondepthwise_group_pack4to1.comp | 121 +++++ 7 files changed, 891 insertions(+), 95 deletions(-) create mode 100644 src/layer/shader/deconvolutiondepthwise_group.comp create mode 100644 src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp create mode 100644 src/layer/shader/deconvolutiondepthwise_group_pack4.comp create mode 100644 src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp index 47ab4249a..ff5bddf3d 100644 --- a/src/layer/convolutiondepthwise.cpp +++ b/src/layer/convolutiondepthwise.cpp @@ -997,10 +997,10 @@ int ConvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC VkMat bottom_blob_bordered_unpacked = bottom_blob_bordered; if (packing == 4 && channels_g % 4 != 0) { - ncnn::Option opt_pad = opt; - opt_pad.blob_vkallocator = opt.workspace_vkallocator; + ncnn::Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; - packing_pack1->forward(bottom_blob_bordered, bottom_blob_bordered_unpacked, cmd, opt_pad); + packing_pack1->forward(bottom_blob_bordered, bottom_blob_bordered_unpacked, cmd, opt_pack1); } VkMat top_blob_unpacked = top_blob; @@ -1073,10 +1073,7 @@ int ConvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC // packing if (num_output_g % 4 != 0 && out_packing == 4) { - ncnn::Option opt_pad = opt; - opt_pad.blob_vkallocator = opt.workspace_vkallocator; - - 
packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt_pad); + packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt); } else { diff --git a/src/layer/deconvolutiondepthwise.cpp b/src/layer/deconvolutiondepthwise.cpp index 19773c1a4..b7c76abb0 100644 --- a/src/layer/deconvolutiondepthwise.cpp +++ b/src/layer/deconvolutiondepthwise.cpp @@ -26,18 +26,24 @@ DeconvolutionDepthWise::DeconvolutionDepthWise() support_vulkan = true; #if NCNN_VULKAN + packing_pack1 = 0; + packing_pack4 = 0; + pipeline_deconvolutiondepthwise = 0; pipeline_deconvolutiondepthwise_pack4 = 0; + + pipeline_deconvolutiondepthwise_group = 0; + pipeline_deconvolutiondepthwise_group_pack4 = 0; + pipeline_deconvolutiondepthwise_group_pack1to4 = 0; + pipeline_deconvolutiondepthwise_group_pack4to1 = 0; #endif // NCNN_VULKAN } DeconvolutionDepthWise::~DeconvolutionDepthWise() { #if NCNN_VULKAN - for (int i=0; i<(int)deconvolution_group_ops.size(); i++) - delete deconvolution_group_ops[i]; - - deconvolution_group_ops.clear(); + delete packing_pack1; + delete packing_pack4; #endif // NCNN_VULKAN } @@ -56,6 +62,33 @@ int DeconvolutionDepthWise::load_param(const ParamDict& pd) weight_data_size = pd.get(6, 0); group = pd.get(7, 1); +#if NCNN_VULKAN + if (pd.use_vulkan_compute) + { + { + packing_pack1 = ncnn::create_layer(ncnn::LayerType::Packing); + packing_pack1->vkdev = vkdev; + + ncnn::ParamDict pd; + pd.set(0, 1); + pd.use_vulkan_compute = 1; + + packing_pack1->load_param(pd); + } + + { + packing_pack4 = ncnn::create_layer(ncnn::LayerType::Packing); + packing_pack4->vkdev = vkdev; + + ncnn::ParamDict pd; + pd.set(0, 4); + pd.use_vulkan_compute = 1; + + packing_pack4->load_param(pd); + } + } +#endif // NCNN_VULKAN + return 0; } @@ -72,65 +105,6 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb) return -100; } -#if NCNN_VULKAN - const int maxk = kernel_w * kernel_h; - int channels = (weight_data_size / group) / maxk / (num_output / group) * group; - - // group deconvolution - if 
(!(channels == group && group == num_output)) - { - // create Deconvolution op for each group - - for (int i=0; i<(int)deconvolution_group_ops.size(); i++) - delete deconvolution_group_ops[i]; - - deconvolution_group_ops.clear(); - - const int channels_g = channels / group; - const int num_output_g = num_output / group; - - deconvolution_group_ops.resize(group); - - for (int g=0; gvkdev = vkdev; - - // set param - ncnn::ParamDict pd; - pd.set(0, num_output_g);// num_output - pd.set(1, kernel_w); - pd.set(11, kernel_h); - pd.set(2, dilation_w); - pd.set(12, dilation_h); - pd.set(3, stride_w); - pd.set(13, stride_h); - pd.set(4, 0);// pad_w - pd.set(14, 0);// pad_h - pd.set(5, bias_term); - pd.set(6, maxk * channels_g * num_output_g);// weight_data_size - - pd.use_vulkan_compute = 1; - - op->load_param(pd); - - // set weights - ncnn::Mat weights[2]; - weights[0] = weight_data_g; - weights[1] = bias_data_g; - - op->load_model(ModelBinFromMatArray(weights)); - - deconvolution_group_ops[g] = op; - } - } -#endif // NCNN_VULKAN - return 0; } @@ -344,9 +318,210 @@ int DeconvolutionDepthWise::upload_model(VkTransfer& cmd) return 0; } - for (int g=0; gupload_model(cmd); + cmd.record_upload(bias_data, bias_data_gpu); + } + + const int channels_g = channels / group; + const int num_output_g = num_output / group; + + // pack4 + if (channels_g % 4 == 0 && num_output_g % 4 == 0) + { + // src = kw-kh-inch-outch + // dst = 4a-4b-kw-kh-inch/4a-outch/4b + Mat weight_data_pack4_groups; + { + Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group); + + weight_data_pack4_groups.create(16*maxk, channels_g/4, num_output_g/4 * group); + + for (int g=0; gset_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8)); + + std::vector specializations(8); + specializations[0].i = kernel_w; + specializations[1].i = kernel_h; + specializations[2].i = dilation_w; + specializations[3].i = dilation_h; + specializations[4].i = stride_w; + 
specializations[5].i = stride_h; + specializations[6].i = bias_term; + specializations[7].i = group; + + pipeline_deconvolutiondepthwise_group->create("deconvolutiondepthwise_group", specializations, 4, 10); + + // pack4 + if (channels_g % 4 == 0 && num_output_g % 4 == 0) + { + pipeline_deconvolutiondepthwise_group_pack4 = new Pipeline(vkdev); + pipeline_deconvolutiondepthwise_group_pack4->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8)); + pipeline_deconvolutiondepthwise_group_pack4->create("deconvolutiondepthwise_group_pack4", specializations, 4, 10); + } + + // pack1to4 + if (channels_g % 4 != 0 && num_output_g % 4 == 0) + { + pipeline_deconvolutiondepthwise_group_pack1to4 = new Pipeline(vkdev); + pipeline_deconvolutiondepthwise_group_pack1to4->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8)); + pipeline_deconvolutiondepthwise_group_pack1to4->create("deconvolutiondepthwise_group_pack1to4", specializations, 4, 10); + } + + // pack4to1 + if (channels_g % 4 == 0 && num_output_g % 4 != 0) + { + pipeline_deconvolutiondepthwise_group_pack4to1 = new Pipeline(vkdev); + pipeline_deconvolutiondepthwise_group_pack4to1->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8)); + pipeline_deconvolutiondepthwise_group_pack4to1->create("deconvolutiondepthwise_group_pack4to1", specializations, 4, 10); + } + + if (channels % 4 == 0 && channels_g % 4 != 0) { - deconvolution_group_ops[g]->create_pipeline(); + packing_pack1->create_pipeline(); + } + + if (num_output_g % 4 != 0 && num_output % 4 == 0) + { + packing_pack4->create_pipeline(); } return 0; @@ -396,10 +619,11 @@ int DeconvolutionDepthWise::create_pipeline() int DeconvolutionDepthWise::destroy_pipeline() { - for (int g=0; g<(int)deconvolution_group_ops.size(); g++) - { - deconvolution_group_ops[g]->destroy_pipeline(); - } + if (packing_pack1) + packing_pack1->destroy_pipeline(); + + if (packing_pack4) + packing_pack4->destroy_pipeline(); delete pipeline_deconvolutiondepthwise; 
pipeline_deconvolutiondepthwise = 0; @@ -407,6 +631,18 @@ int DeconvolutionDepthWise::destroy_pipeline() delete pipeline_deconvolutiondepthwise_pack4; pipeline_deconvolutiondepthwise_pack4 = 0; + delete pipeline_deconvolutiondepthwise_group; + pipeline_deconvolutiondepthwise_group = 0; + + delete pipeline_deconvolutiondepthwise_group_pack4; + pipeline_deconvolutiondepthwise_group_pack4 = 0; + + delete pipeline_deconvolutiondepthwise_group_pack1to4; + pipeline_deconvolutiondepthwise_group_pack1to4 = 0; + + delete pipeline_deconvolutiondepthwise_group_pack4to1; + pipeline_deconvolutiondepthwise_group_pack4to1 = 0; + return 0; } @@ -423,10 +659,10 @@ int DeconvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, V int outw = (w - 1) * stride_w + kernel_extent_w; int outh = (h - 1) * stride_h + kernel_extent_h; + int out_packing = num_output % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / packing * out_packing; - // TODO assert num_output % packing == 0 - - top_blob.create(outw, outh, num_output / packing, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(outw, outh, num_output / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) return -100; @@ -463,25 +699,94 @@ int DeconvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, V return 0; } - // record - cmd.record_prepare_compute_barrier(top_blob); + const int channels_g = channels * packing / group; + const int num_output_g = num_output / group; - const int channels_g = channels / group; - const int num_output_g = num_output / packing / group; + // unpacking + VkMat bottom_blob_unpacked = bottom_blob; + if (packing == 4 && channels_g % 4 != 0) + { + ncnn::Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + packing_pack1->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1); + } + + VkMat top_blob_unpacked = top_blob; + if (num_output_g % 4 != 0 && 
out_packing == 4) + { + top_blob_unpacked.create(outw, outh, num_output, elemsize / packing, 1, opt.workspace_vkallocator, opt.staging_vkallocator); + if (top_blob_unpacked.empty()) + return -100; + } - for (int g=0; g bindings(4); + bindings[0] = bottom_blob_unpacked; + bindings[1] = top_blob_unpacked; + if (channels_g % 4 != 0 && num_output_g % 4 != 0) + { + bindings[2] = weight_data_gpu; + bindings[3] = bias_term ? bias_data_gpu : weight_data_gpu;// TODO use dummy buffer + } + else if (channels_g % 4 == 0 && num_output_g % 4 == 0) + { + bindings[2] = weight_data_gpu_pack4; + bindings[3] = bias_term ? bias_data_gpu_pack4 : weight_data_gpu_pack4;// TODO use dummy buffer + } + else if (channels_g % 4 != 0 && num_output_g % 4 == 0) { - VkMat bottom_blob_bordered_g = bottom_blob.channel_range(channels_g * g, channels_g); - VkMat top_blob_g = top_blob.channel_range(num_output_g * g, num_output_g); + bindings[2] = weight_data_gpu_pack1to4; + bindings[3] = bias_term ? bias_data_gpu_pack4 : weight_data_gpu_pack1to4;// TODO use dummy buffer + } + else if (channels_g % 4 == 0 && num_output_g % 4 != 0) + { + bindings[2] = weight_data_gpu_pack4to1; + bindings[3] = bias_term ? 
bias_data_gpu : weight_data_gpu_pack4to1;// TODO use dummy buffer + } - const ncnn::Layer* op = deconvolution_group_ops[g]; + std::vector constants(10); + constants[0].i = bottom_blob_unpacked.dims; + constants[1].i = bottom_blob_unpacked.w; + constants[2].i = bottom_blob_unpacked.h; + constants[3].i = bottom_blob_unpacked.c; + constants[4].i = bottom_blob_unpacked.cstep; + constants[5].i = top_blob_unpacked.dims; + constants[6].i = top_blob_unpacked.w; + constants[7].i = top_blob_unpacked.h; + constants[8].i = top_blob_unpacked.c; + constants[9].i = top_blob_unpacked.cstep; + + const Pipeline* pipeline = 0; + if (channels_g % 4 != 0 && num_output_g % 4 != 0) + { + pipeline = pipeline_deconvolutiondepthwise_group; + } + else if (channels_g % 4 == 0 && num_output_g % 4 == 0) + { + pipeline = pipeline_deconvolutiondepthwise_group_pack4; + } + else if (channels_g % 4 != 0 && num_output_g % 4 == 0) + { + pipeline = pipeline_deconvolutiondepthwise_group_pack1to4; + } + else if (channels_g % 4 == 0 && num_output_g % 4 != 0) + { + pipeline = pipeline_deconvolutiondepthwise_group_pack4to1; + } - ncnn::Option opt_g = opt; - opt_g.blob_vkallocator = top_blob.allocator; - opt_g.staging_vkallocator = top_blob.staging_allocator; + // record + cmd.record_prepare_compute_barrier(bottom_blob_unpacked); + cmd.record_prepare_compute_barrier(top_blob_unpacked); + cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked); - // forward - op->forward(bottom_blob_bordered_g, top_blob_g, cmd, opt_g); + // packing + if (num_output_g % 4 != 0 && out_packing == 4) + { + packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt); + } + else + { + top_blob = top_blob_unpacked; } return 0; diff --git a/src/layer/deconvolutiondepthwise.h b/src/layer/deconvolutiondepthwise.h index 8cab20802..e9713ad4b 100644 --- a/src/layer/deconvolutiondepthwise.h +++ b/src/layer/deconvolutiondepthwise.h @@ -64,13 +64,27 @@ public: VkMat weight_data_gpu; VkMat bias_data_gpu; - std::vector 
deconvolution_group_ops; + ncnn::Layer* packing_pack1; + ncnn::Layer* packing_pack4; Pipeline* pipeline_deconvolutiondepthwise; + // pack4 VkMat weight_data_gpu_pack4; VkMat bias_data_gpu_pack4; + Pipeline* pipeline_deconvolutiondepthwise_pack4; + + Pipeline* pipeline_deconvolutiondepthwise_group; + Pipeline* pipeline_deconvolutiondepthwise_group_pack4; + + // pack1to4 + VkMat weight_data_gpu_pack1to4; + Pipeline* pipeline_deconvolutiondepthwise_group_pack1to4; + + // pack4to1 + VkMat weight_data_gpu_pack4to1; + Pipeline* pipeline_deconvolutiondepthwise_group_pack4to1; #endif // NCNN_VULKAN }; diff --git a/src/layer/shader/deconvolutiondepthwise_group.comp b/src/layer/shader/deconvolutiondepthwise_group.comp new file mode 100644 index 000000000..a399010de --- /dev/null +++ b/src/layer/shader/deconvolutiondepthwise_group.comp @@ -0,0 +1,117 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+
+#version 450
+
+layout (constant_id = 0) const int kernel_w = 1;
+layout (constant_id = 1) const int kernel_h = 1;
+layout (constant_id = 2) const int dilation_w = 1;
+layout (constant_id = 3) const int dilation_h = 1;
+layout (constant_id = 4) const int stride_w = 1;
+layout (constant_id = 5) const int stride_h = 1;
+layout (constant_id = 6) const int bias_term = 0;
+layout (constant_id = 7) const int group = 1;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
+layout (binding = 2) readonly buffer weight_blob { float weight_data[]; };
+layout (binding = 3) readonly buffer bias_blob { float bias_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+// Grouped deconvolution, gather form: each invocation computes the one
+// top_blob element at (gx, gy, gz) by summing the input taps that reach it.
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    // bias_term is a specialization constant; ?: evaluates only the selected operand
+    float sum = (bias_term == 1) ? bias_data[gz] : 0.f;
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    // group convolution
+    const int channels_g = p.c / group;
+    const int num_output_g = p.outc / group;
+
+    // group id
+    const int gg = gz / num_output_g;
+
+    // weights of output channel gz start at w_offset_0, inputs of group gg at v_offset_0
+    int w_offset_0 = gz * channels_g * kernel_w * kernel_h;
+    int v_offset_0 = gg * channels_g * p.cstep;
+
+    for (int y = 0; y < kernel_h; y++)
+    {
+        // sys / stride_h is the input row feeding this tap; a negative sys lies
+        // before row 0. Test the sign BEFORE the modulo: integer % with a
+        // negative operand is undefined in GLSL, so it must never be evaluated.
+        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));
+        if (sys < 0 || sys % stride_h != 0)
+            continue;
+
+        int sy = sys / stride_h;
+        if (sy >= p.h)
+            continue;
+
+        for (int x = 0; x < kernel_w; x++)
+        {
+            // same sign-before-modulo guard for the input column
+            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));
+            if (sxs < 0 || sxs % stride_w != 0)
+                continue;
+
+            int sx = sxs / stride_w;
+            if (sx >= p.w)
+                continue;
+
+            int v_offset = v_offset_0 + sy * p.w + sx;
+            int w_offset = w_offset_0 + y * kernel_w + x;
+
+            // accumulate over the channels_g input channels of this group
+            for (int z = 0; z < channels_g; z++)
+            {
+                sum += weight_data[w_offset] * bottom_blob_data[v_offset];
+
+                v_offset += p.cstep;
+                w_offset += kernel_w * kernel_h;
+            }
+        }
+    }
+
+    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
+}
diff --git a/src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp b/src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp
new file mode 100644
index 000000000..8def83602
--- /dev/null
+++ b/src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp
@@ -0,0 +1,121 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int kernel_w = 1;
+layout (constant_id = 1) const int kernel_h = 1;
+layout (constant_id = 2) const int dilation_w = 1;
+layout (constant_id = 3) const int dilation_h = 1;
+layout (constant_id = 4) const int stride_w = 1;
+layout (constant_id = 5) const int stride_h = 1;
+layout (constant_id = 6) const int bias_term = 0;
+layout (constant_id = 7) const int group = 1;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
+layout (binding = 2) readonly buffer weight_blob { vec4 weight_data[]; };
+layout (binding = 3) readonly buffer bias_blob { vec4 bias_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+// Grouped deconvolution, gather form, float input -> vec4 output: each
+// invocation computes the one vec4 top_blob element at (gx, gy, gz).
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    // bias_term is a specialization constant; ?: evaluates only the selected operand
+    vec4 sum = (bias_term == 1) ? bias_data[gz] : vec4(0.f);
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    // group convolution
+    const int channels_g = p.c / group;
+    const int num_output_g = p.outc / group;
+
+    // group id
+    const int gg = gz / num_output_g;
+
+    // weights of output element gz start at w_offset_0, inputs of group gg at v_offset_0
+    int w_offset_0 = gz * channels_g * kernel_w * kernel_h;
+    int v_offset_0 = gg * channels_g * p.cstep;
+
+    for (int y = 0; y < kernel_h; y++)
+    {
+        // sys / stride_h is the input row feeding this tap; a negative sys lies
+        // before row 0. Test the sign BEFORE the modulo: integer % with a
+        // negative operand is undefined in GLSL, so it must never be evaluated.
+        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));
+        if (sys < 0 || sys % stride_h != 0)
+            continue;
+
+        int sy = sys / stride_h;
+        if (sy >= p.h)
+            continue;
+
+        for (int x = 0; x < kernel_w; x++)
+        {
+            // same sign-before-modulo guard for the input column
+            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));
+            if (sxs < 0 || sxs % stride_w != 0)
+                continue;
+
+            int sx = sxs / stride_w;
+            if (sx >= p.w)
+                continue;
+
+            int v_offset = v_offset_0 + sy * p.w + sx;
+            int w_offset = w_offset_0 + y * kernel_w + x;
+
+            // each scalar input scales a vec4 of weights, accumulating four output lanes
+            for (int z = 0; z < channels_g; z++)
+            {
+                float v = bottom_blob_data[v_offset];
+
+                vec4 k = weight_data[w_offset];
+
+                sum += v * k;
+
+                v_offset += p.cstep;
+                w_offset += kernel_w * kernel_h;
+            }
+        }
+    }
+
+    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
+}
diff --git a/src/layer/shader/deconvolutiondepthwise_group_pack4.comp b/src/layer/shader/deconvolutiondepthwise_group_pack4.comp
new file mode 100644
index 000000000..65670c5c5
--- /dev/null
+++ b/src/layer/shader/deconvolutiondepthwise_group_pack4.comp
@@ -0,0 +1,121 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int kernel_w = 1;
+layout (constant_id = 1) const int kernel_h = 1;
+layout (constant_id = 2) const int dilation_w = 1;
+layout (constant_id = 3) const int dilation_h = 1;
+layout (constant_id = 4) const int stride_w = 1;
+layout (constant_id = 5) const int stride_h = 1;
+layout (constant_id = 6) const int bias_term = 0;
+layout (constant_id = 7) const int group = 1;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
+layout (binding = 2) readonly buffer weight_blob { mat4 weight_data[]; };
+layout (binding = 3) readonly buffer bias_blob { vec4 bias_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+// Grouped deconvolution, gather form, vec4 input -> vec4 output: each
+// invocation computes the one vec4 top_blob element at (gx, gy, gz).
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    // bias_term is a specialization constant; ?: evaluates only the selected operand
+    vec4 sum = (bias_term == 1) ? bias_data[gz] : vec4(0.f);
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    // group convolution
+    const int channels_g = p.c / group;
+    const int num_output_g = p.outc / group;
+
+    // group id
+    const int gg = gz / num_output_g;
+
+    // weights of output element gz start at w_offset_0, inputs of group gg at v_offset_0
+    int w_offset_0 = gz * channels_g * kernel_w * kernel_h;
+    int v_offset_0 = gg * channels_g * p.cstep;
+
+    for (int y = 0; y < kernel_h; y++)
+    {
+        // sys / stride_h is the input row feeding this tap; a negative sys lies
+        // before row 0. Test the sign BEFORE the modulo: integer % with a
+        // negative operand is undefined in GLSL, so it must never be evaluated.
+        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));
+        if (sys < 0 || sys % stride_h != 0)
+            continue;
+
+        int sy = sys / stride_h;
+        if (sy >= p.h)
+            continue;
+
+        for (int x = 0; x < kernel_w; x++)
+        {
+            // same sign-before-modulo guard for the input column
+            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));
+            if (sxs < 0 || sxs % stride_w != 0)
+                continue;
+
+            int sx = sxs / stride_w;
+            if (sx >= p.w)
+                continue;
+
+            int v_offset = v_offset_0 + sy * p.w + sx;
+            int w_offset = w_offset_0 + y * kernel_w + x;
+
+            // v * k is vector-times-matrix: GLSL treats v as a row vector here
+            for (int z = 0; z < channels_g; z++)
+            {
+                vec4 v = bottom_blob_data[v_offset];
+
+                mat4 k = weight_data[w_offset];
+
+                sum += v * k;
+
+                v_offset += p.cstep;
+                w_offset += kernel_w * kernel_h;
+            }
+        }
+    }
+
+    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
+}
diff --git a/src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp b/src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp
new file mode 100644
index 000000000..4f5948b69
--- /dev/null
+++ b/src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp
@@ -0,0 +1,121 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int kernel_w = 1;
+layout (constant_id = 1) const int kernel_h = 1;
+layout (constant_id = 2) const int dilation_w = 1;
+layout (constant_id = 3) const int dilation_h = 1;
+layout (constant_id = 4) const int stride_w = 1;
+layout (constant_id = 5) const int stride_h = 1;
+layout (constant_id = 6) const int bias_term = 0;
+layout (constant_id = 7) const int group = 1;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
+layout (binding = 2) readonly buffer weight_blob { vec4 weight_data[]; };
+layout (binding = 3) readonly buffer bias_blob { float bias_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+// Grouped deconvolution, gather form, vec4 input -> float output: each
+// invocation computes the one scalar top_blob element at (gx, gy, gz).
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    // bias_term is a specialization constant; ?: evaluates only the selected operand
+    float sum = (bias_term == 1) ? bias_data[gz] : 0.f;
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    // group convolution
+    const int channels_g = p.c / group;
+    const int num_output_g = p.outc / group;
+
+    // group id
+    const int gg = gz / num_output_g;
+
+    // weights of output channel gz start at w_offset_0, inputs of group gg at v_offset_0
+    int w_offset_0 = gz * channels_g * kernel_w * kernel_h;
+    int v_offset_0 = gg * channels_g * p.cstep;
+
+    for (int y = 0; y < kernel_h; y++)
+    {
+        // sys / stride_h is the input row feeding this tap; a negative sys lies
+        // before row 0. Test the sign BEFORE the modulo: integer % with a
+        // negative operand is undefined in GLSL, so it must never be evaluated.
+        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));
+        if (sys < 0 || sys % stride_h != 0)
+            continue;
+
+        int sy = sys / stride_h;
+        if (sy >= p.h)
+            continue;
+
+        for (int x = 0; x < kernel_w; x++)
+        {
+            // same sign-before-modulo guard for the input column
+            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));
+            if (sxs < 0 || sxs % stride_w != 0)
+                continue;
+
+            int sx = sxs / stride_w;
+            if (sx >= p.w)
+                continue;
+
+            int v_offset = v_offset_0 + sy * p.w + sx;
+            int w_offset = w_offset_0 + y * kernel_w + x;
+
+            // dot() reduces the four lanes of each input element to one scalar
+            for (int z = 0; z < channels_g; z++)
+            {
+                vec4 v = bottom_blob_data[v_offset];
+
+                vec4 k = weight_data[w_offset];
+
+                sum += dot(v, k);
+
+                v_offset += p.cstep;
+                w_offset += kernel_w * kernel_h;
+            }
+        }
+    }
+
+    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
+}