Browse Source

group deconvolution packing family

tags/20190320
nihuini 7 years ago
parent
commit
43c4b57201
7 changed files with 891 additions and 95 deletions
  1. +4
    -7
      src/layer/convolutiondepthwise.cpp
  2. +392
    -87
      src/layer/deconvolutiondepthwise.cpp
  3. +15
    -1
      src/layer/deconvolutiondepthwise.h
  4. +117
    -0
      src/layer/shader/deconvolutiondepthwise_group.comp
  5. +121
    -0
      src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp
  6. +121
    -0
      src/layer/shader/deconvolutiondepthwise_group_pack4.comp
  7. +121
    -0
      src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp

+ 4
- 7
src/layer/convolutiondepthwise.cpp View File

@@ -997,10 +997,10 @@ int ConvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
VkMat bottom_blob_bordered_unpacked = bottom_blob_bordered;
if (packing == 4 && channels_g % 4 != 0)
{
ncnn::Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;
ncnn::Option opt_pack1 = opt;
opt_pack1.blob_vkallocator = opt.workspace_vkallocator;

packing_pack1->forward(bottom_blob_bordered, bottom_blob_bordered_unpacked, cmd, opt_pad);
packing_pack1->forward(bottom_blob_bordered, bottom_blob_bordered_unpacked, cmd, opt_pack1);
}

VkMat top_blob_unpacked = top_blob;
@@ -1073,10 +1073,7 @@ int ConvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
// packing
if (num_output_g % 4 != 0 && out_packing == 4)
{
ncnn::Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt_pad);
packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt);
}
else
{


+ 392
- 87
src/layer/deconvolutiondepthwise.cpp View File

@@ -26,18 +26,24 @@ DeconvolutionDepthWise::DeconvolutionDepthWise()
support_vulkan = true;

#if NCNN_VULKAN
packing_pack1 = 0;
packing_pack4 = 0;

pipeline_deconvolutiondepthwise = 0;
pipeline_deconvolutiondepthwise_pack4 = 0;

pipeline_deconvolutiondepthwise_group = 0;
pipeline_deconvolutiondepthwise_group_pack4 = 0;
pipeline_deconvolutiondepthwise_group_pack1to4 = 0;
pipeline_deconvolutiondepthwise_group_pack4to1 = 0;
#endif // NCNN_VULKAN
}

DeconvolutionDepthWise::~DeconvolutionDepthWise()
{
#if NCNN_VULKAN
for (int i=0; i<(int)deconvolution_group_ops.size(); i++)
delete deconvolution_group_ops[i];

deconvolution_group_ops.clear();
delete packing_pack1;
delete packing_pack4;
#endif // NCNN_VULKAN
}

@@ -56,6 +62,33 @@ int DeconvolutionDepthWise::load_param(const ParamDict& pd)
weight_data_size = pd.get(6, 0);
group = pd.get(7, 1);

#if NCNN_VULKAN
if (pd.use_vulkan_compute)
{
{
packing_pack1 = ncnn::create_layer(ncnn::LayerType::Packing);
packing_pack1->vkdev = vkdev;

ncnn::ParamDict pd;
pd.set(0, 1);
pd.use_vulkan_compute = 1;

packing_pack1->load_param(pd);
}

{
packing_pack4 = ncnn::create_layer(ncnn::LayerType::Packing);
packing_pack4->vkdev = vkdev;

ncnn::ParamDict pd;
pd.set(0, 4);
pd.use_vulkan_compute = 1;

packing_pack4->load_param(pd);
}
}
#endif // NCNN_VULKAN

return 0;
}

@@ -72,65 +105,6 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb)
return -100;
}

#if NCNN_VULKAN
const int maxk = kernel_w * kernel_h;
int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

// group deconvolution
if (!(channels == group && group == num_output))
{
// create Deconvolution op for each group

for (int i=0; i<(int)deconvolution_group_ops.size(); i++)
delete deconvolution_group_ops[i];

deconvolution_group_ops.clear();

const int channels_g = channels / group;
const int num_output_g = num_output / group;

deconvolution_group_ops.resize(group);

for (int g=0; g<group; g++)
{
Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
Mat bias_data_g;
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);

ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
op->vkdev = vkdev;

// set param
ncnn::ParamDict pd;
pd.set(0, num_output_g);// num_output
pd.set(1, kernel_w);
pd.set(11, kernel_h);
pd.set(2, dilation_w);
pd.set(12, dilation_h);
pd.set(3, stride_w);
pd.set(13, stride_h);
pd.set(4, 0);// pad_w
pd.set(14, 0);// pad_h
pd.set(5, bias_term);
pd.set(6, maxk * channels_g * num_output_g);// weight_data_size

pd.use_vulkan_compute = 1;

op->load_param(pd);

// set weights
ncnn::Mat weights[2];
weights[0] = weight_data_g;
weights[1] = bias_data_g;

op->load_model(ModelBinFromMatArray(weights));

deconvolution_group_ops[g] = op;
}
}
#endif // NCNN_VULKAN

return 0;
}

@@ -344,9 +318,210 @@ int DeconvolutionDepthWise::upload_model(VkTransfer& cmd)
return 0;
}

for (int g=0; g<group; g++)
// group deconvolution
cmd.record_upload(weight_data_transposed, weight_data_gpu);

if (bias_term)
{
deconvolution_group_ops[g]->upload_model(cmd);
cmd.record_upload(bias_data, bias_data_gpu);
}

const int channels_g = channels / group;
const int num_output_g = num_output / group;

// pack4
if (channels_g % 4 == 0 && num_output_g % 4 == 0)
{
// src = kw-kh-inch-outch
// dst = 4a-4b-kw-kh-inch/4a-outch/4b
Mat weight_data_pack4_groups;
{
Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);

weight_data_pack4_groups.create(16*maxk, channels_g/4, num_output_g/4 * group);

for (int g=0; g<group; g++)
{
const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);

Mat weight_data_pack4 = weight_data_pack4_groups.channel_range(num_output_g/4 * g, num_output_g/4);

for (int q=0; q+3<num_output_g; q+=4)
{
const Mat k0 = weight_data_r2.channel(q);
const Mat k1 = weight_data_r2.channel(q+1);
const Mat k2 = weight_data_r2.channel(q+2);
const Mat k3 = weight_data_r2.channel(q+3);

Mat g0 = weight_data_pack4.channel(q/4);

for (int p=0; p+3<channels_g; p+=4)
{
const float* k00 = k0.row(p);
const float* k01 = k0.row(p+1);
const float* k02 = k0.row(p+2);
const float* k03 = k0.row(p+3);

const float* k10 = k1.row(p);
const float* k11 = k1.row(p+1);
const float* k12 = k1.row(p+2);
const float* k13 = k1.row(p+3);

const float* k20 = k2.row(p);
const float* k21 = k2.row(p+1);
const float* k22 = k2.row(p+2);
const float* k23 = k2.row(p+3);

const float* k30 = k3.row(p);
const float* k31 = k3.row(p+1);
const float* k32 = k3.row(p+2);
const float* k33 = k3.row(p+3);

float* g00 = g0.row(p/4);

for (int k=0; k<maxk; k++)
{
g00[0] = k00[k];
g00[1] = k01[k];
g00[2] = k02[k];
g00[3] = k03[k];

g00[4] = k10[k];
g00[5] = k11[k];
g00[6] = k12[k];
g00[7] = k13[k];

g00[8] = k20[k];
g00[9] = k21[k];
g00[10] = k22[k];
g00[11] = k23[k];

g00[12] = k30[k];
g00[13] = k31[k];
g00[14] = k32[k];
g00[15] = k33[k];

g00 += 16;
}
}
}
}
}

weight_data_pack4_groups = weight_data_pack4_groups.reshape(16*maxk * (channels_g/4) * (num_output_g/4) * group);
cmd.record_upload(weight_data_pack4_groups, weight_data_gpu_pack4);
}

// pack1to4
if (channels_g % 4 != 0 && num_output_g % 4 == 0)
{
// src = kw-kh-inch-outch
// dst = 4b-kw-kh-inch-outch/4b
Mat weight_data_pack1to4_groups;
{
Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);

weight_data_pack1to4_groups.create(4*maxk, channels_g, num_output_g/4 * group);

for (int g=0; g<group; g++)
{
const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);

Mat weight_data_pack1to4 = weight_data_pack1to4_groups.channel_range(num_output_g/4 * g, num_output_g/4);

for (int q=0; q+3<num_output_g; q+=4)
{
const Mat k0 = weight_data_r2.channel(q);
const Mat k1 = weight_data_r2.channel(q+1);
const Mat k2 = weight_data_r2.channel(q+2);
const Mat k3 = weight_data_r2.channel(q+3);

Mat g0 = weight_data_pack1to4.channel(q/4);

for (int p=0; p<channels_g; p++)
{
const float* k00 = k0.row(p);
const float* k10 = k1.row(p);
const float* k20 = k2.row(p);
const float* k30 = k3.row(p);

float* g00 = g0.row(p);

for (int k=0; k<maxk; k++)
{
g00[0] = k00[k];
g00[1] = k10[k];
g00[2] = k20[k];
g00[3] = k30[k];

g00 += 4;
}
}
}
}
}

weight_data_pack1to4_groups = weight_data_pack1to4_groups.reshape(4*maxk * channels_g * (num_output_g/4) * group);
cmd.record_upload(weight_data_pack1to4_groups, weight_data_gpu_pack1to4);
}

// pack4to1
if (channels_g % 4 == 0 && num_output_g % 4 != 0)
{
// src = kw-kh-inch-outch
// dst = 4a-kw-kh-inch/4a-outch
Mat weight_data_pack4to1_groups;
{
Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);

weight_data_pack4to1_groups.create(4*maxk, channels_g/4, num_output_g * group);

for (int g=0; g<group; g++)
{
const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);

Mat weight_data_pack4to1 = weight_data_pack4to1_groups.channel_range(num_output_g * g, num_output_g);

for (int q=0; q<num_output_g; q++)
{
const Mat k0 = weight_data_r2.channel(q);
Mat g0 = weight_data_pack4to1.channel(q);

for (int p=0; p+3<channels_g; p+=4)
{
const float* k00 = k0.row(p);
const float* k01 = k0.row(p+1);
const float* k02 = k0.row(p+2);
const float* k03 = k0.row(p+3);

float* g00 = g0.row(p/4);

for (int k=0; k<maxk; k++)
{
g00[0] = k00[k];
g00[1] = k01[k];
g00[2] = k02[k];
g00[3] = k03[k];

g00 += 4;
}
}
}
}
}

weight_data_pack4to1_groups = weight_data_pack4to1_groups.reshape(4*maxk * (channels_g/4) * num_output_g * group);
cmd.record_upload(weight_data_pack4to1_groups, weight_data_gpu_pack4to1);
}

if (num_output_g % 4 == 0)
{
if (bias_term)
{
Mat bias_data_pack4;
convert_packing(bias_data, bias_data_pack4, 4);
cmd.record_upload(bias_data_pack4, bias_data_gpu_pack4);
}
}

return 0;
@@ -386,9 +561,57 @@ int DeconvolutionDepthWise::create_pipeline()
return 0;
}

for (int g=0; g<group; g++)
const int channels_g = channels / group;
const int num_output_g = num_output / group;

// group deconvolution
pipeline_deconvolutiondepthwise_group = new Pipeline(vkdev);
pipeline_deconvolutiondepthwise_group->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));

std::vector<vk_specialization_type> specializations(8);
specializations[0].i = kernel_w;
specializations[1].i = kernel_h;
specializations[2].i = dilation_w;
specializations[3].i = dilation_h;
specializations[4].i = stride_w;
specializations[5].i = stride_h;
specializations[6].i = bias_term;
specializations[7].i = group;

pipeline_deconvolutiondepthwise_group->create("deconvolutiondepthwise_group", specializations, 4, 10);

// pack4
if (channels_g % 4 == 0 && num_output_g % 4 == 0)
{
pipeline_deconvolutiondepthwise_group_pack4 = new Pipeline(vkdev);
pipeline_deconvolutiondepthwise_group_pack4->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
pipeline_deconvolutiondepthwise_group_pack4->create("deconvolutiondepthwise_group_pack4", specializations, 4, 10);
}

// pack1to4
if (channels_g % 4 != 0 && num_output_g % 4 == 0)
{
pipeline_deconvolutiondepthwise_group_pack1to4 = new Pipeline(vkdev);
pipeline_deconvolutiondepthwise_group_pack1to4->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
pipeline_deconvolutiondepthwise_group_pack1to4->create("deconvolutiondepthwise_group_pack1to4", specializations, 4, 10);
}

// pack4to1
if (channels_g % 4 == 0 && num_output_g % 4 != 0)
{
pipeline_deconvolutiondepthwise_group_pack4to1 = new Pipeline(vkdev);
pipeline_deconvolutiondepthwise_group_pack4to1->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
pipeline_deconvolutiondepthwise_group_pack4to1->create("deconvolutiondepthwise_group_pack4to1", specializations, 4, 10);
}

if (channels % 4 == 0 && channels_g % 4 != 0)
{
deconvolution_group_ops[g]->create_pipeline();
packing_pack1->create_pipeline();
}

if (num_output_g % 4 != 0 && num_output % 4 == 0)
{
packing_pack4->create_pipeline();
}

return 0;
@@ -396,10 +619,11 @@ int DeconvolutionDepthWise::create_pipeline()

int DeconvolutionDepthWise::destroy_pipeline()
{
for (int g=0; g<(int)deconvolution_group_ops.size(); g++)
{
deconvolution_group_ops[g]->destroy_pipeline();
}
if (packing_pack1)
packing_pack1->destroy_pipeline();

if (packing_pack4)
packing_pack4->destroy_pipeline();

delete pipeline_deconvolutiondepthwise;
pipeline_deconvolutiondepthwise = 0;
@@ -407,6 +631,18 @@ int DeconvolutionDepthWise::destroy_pipeline()
delete pipeline_deconvolutiondepthwise_pack4;
pipeline_deconvolutiondepthwise_pack4 = 0;

delete pipeline_deconvolutiondepthwise_group;
pipeline_deconvolutiondepthwise_group = 0;

delete pipeline_deconvolutiondepthwise_group_pack4;
pipeline_deconvolutiondepthwise_group_pack4 = 0;

delete pipeline_deconvolutiondepthwise_group_pack1to4;
pipeline_deconvolutiondepthwise_group_pack1to4 = 0;

delete pipeline_deconvolutiondepthwise_group_pack4to1;
pipeline_deconvolutiondepthwise_group_pack4to1 = 0;

return 0;
}

@@ -423,10 +659,10 @@ int DeconvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, V

int outw = (w - 1) * stride_w + kernel_extent_w;
int outh = (h - 1) * stride_h + kernel_extent_h;
int out_packing = num_output % 4 == 0 ? 4 : 1;
size_t out_elemsize = elemsize / packing * out_packing;

// TODO assert num_output % packing == 0

top_blob.create(outw, outh, num_output / packing, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator);
top_blob.create(outw, outh, num_output / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob.empty())
return -100;

@@ -463,25 +699,94 @@ int DeconvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, V
return 0;
}

// record
cmd.record_prepare_compute_barrier(top_blob);
const int channels_g = channels * packing / group;
const int num_output_g = num_output / group;

const int channels_g = channels / group;
const int num_output_g = num_output / packing / group;
// unpacking
VkMat bottom_blob_unpacked = bottom_blob;
if (packing == 4 && channels_g % 4 != 0)
{
ncnn::Option opt_pack1 = opt;
opt_pack1.blob_vkallocator = opt.workspace_vkallocator;

packing_pack1->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1);
}

VkMat top_blob_unpacked = top_blob;
if (num_output_g % 4 != 0 && out_packing == 4)
{
top_blob_unpacked.create(outw, outh, num_output, elemsize / packing, 1, opt.workspace_vkallocator, opt.staging_vkallocator);
if (top_blob_unpacked.empty())
return -100;
}

for (int g=0; g<group; g++)
std::vector<VkMat> bindings(4);
bindings[0] = bottom_blob_unpacked;
bindings[1] = top_blob_unpacked;
if (channels_g % 4 != 0 && num_output_g % 4 != 0)
{
bindings[2] = weight_data_gpu;
bindings[3] = bias_term ? bias_data_gpu : weight_data_gpu;// TODO use dummy buffer
}
else if (channels_g % 4 == 0 && num_output_g % 4 == 0)
{
bindings[2] = weight_data_gpu_pack4;
bindings[3] = bias_term ? bias_data_gpu_pack4 : weight_data_gpu_pack4;// TODO use dummy buffer
}
else if (channels_g % 4 != 0 && num_output_g % 4 == 0)
{
VkMat bottom_blob_bordered_g = bottom_blob.channel_range(channels_g * g, channels_g);
VkMat top_blob_g = top_blob.channel_range(num_output_g * g, num_output_g);
bindings[2] = weight_data_gpu_pack1to4;
bindings[3] = bias_term ? bias_data_gpu_pack4 : weight_data_gpu_pack1to4;// TODO use dummy buffer
}
else if (channels_g % 4 == 0 && num_output_g % 4 != 0)
{
bindings[2] = weight_data_gpu_pack4to1;
bindings[3] = bias_term ? bias_data_gpu : weight_data_gpu_pack4to1;// TODO use dummy buffer
}

const ncnn::Layer* op = deconvolution_group_ops[g];
std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob_unpacked.dims;
constants[1].i = bottom_blob_unpacked.w;
constants[2].i = bottom_blob_unpacked.h;
constants[3].i = bottom_blob_unpacked.c;
constants[4].i = bottom_blob_unpacked.cstep;
constants[5].i = top_blob_unpacked.dims;
constants[6].i = top_blob_unpacked.w;
constants[7].i = top_blob_unpacked.h;
constants[8].i = top_blob_unpacked.c;
constants[9].i = top_blob_unpacked.cstep;

const Pipeline* pipeline = 0;
if (channels_g % 4 != 0 && num_output_g % 4 != 0)
{
pipeline = pipeline_deconvolutiondepthwise_group;
}
else if (channels_g % 4 == 0 && num_output_g % 4 == 0)
{
pipeline = pipeline_deconvolutiondepthwise_group_pack4;
}
else if (channels_g % 4 != 0 && num_output_g % 4 == 0)
{
pipeline = pipeline_deconvolutiondepthwise_group_pack1to4;
}
else if (channels_g % 4 == 0 && num_output_g % 4 != 0)
{
pipeline = pipeline_deconvolutiondepthwise_group_pack4to1;
}

ncnn::Option opt_g = opt;
opt_g.blob_vkallocator = top_blob.allocator;
opt_g.staging_vkallocator = top_blob.staging_allocator;
// record
cmd.record_prepare_compute_barrier(bottom_blob_unpacked);
cmd.record_prepare_compute_barrier(top_blob_unpacked);
cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);

// forward
op->forward(bottom_blob_bordered_g, top_blob_g, cmd, opt_g);
// packing
if (num_output_g % 4 != 0 && out_packing == 4)
{
packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt);
}
else
{
top_blob = top_blob_unpacked;
}

return 0;


+ 15
- 1
src/layer/deconvolutiondepthwise.h View File

@@ -64,13 +64,27 @@ public:
VkMat weight_data_gpu;
VkMat bias_data_gpu;

std::vector<ncnn::Layer*> deconvolution_group_ops;
ncnn::Layer* packing_pack1;
ncnn::Layer* packing_pack4;

Pipeline* pipeline_deconvolutiondepthwise;

// pack4
VkMat weight_data_gpu_pack4;
VkMat bias_data_gpu_pack4;

Pipeline* pipeline_deconvolutiondepthwise_pack4;

Pipeline* pipeline_deconvolutiondepthwise_group;
Pipeline* pipeline_deconvolutiondepthwise_group_pack4;

// pack1to4
VkMat weight_data_gpu_pack1to4;
Pipeline* pipeline_deconvolutiondepthwise_group_pack1to4;

// pack4to1
VkMat weight_data_gpu_pack4to1;
Pipeline* pipeline_deconvolutiondepthwise_group_pack4to1;
#endif // NCNN_VULKAN

};


+ 117
- 0
src/layer/shader/deconvolutiondepthwise_group.comp View File

@@ -0,0 +1,117 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Specialization constants. The host code in this commit
// (DeconvolutionDepthWise::create_pipeline) fills ids 0-7, in this order, with
// kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, bias_term and
// group. The initializers below are only the defaults used when an id is not
// specialized.
layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int group = 1;

// Workgroup dimensions are taken from specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Storage buffers in host binding order: input blob, output blob, weights, bias.
// This variant views all four as plain float arrays.
// NOTE(review): when bias_term is 0 the host binds the weight buffer to slot 3 as
// a placeholder, so bias_data must only be read when bias_term == 1.
layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { float weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { float bias_data[]; };

// Push constants: shape of the input blob (dims, w, h, c, cstep) followed by the
// shape of the output blob (outdims, outw, outh, outc, outcstep). The host copies
// them from the matching VkMat fields in exactly this order.
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Group deconvolution, scalar input -> scalar output.
//
// Each invocation produces one output element (gx, gy, gz). It gathers every
// input pixel that the transposed convolution scatters onto (gx, gy), over the
// channels_g input channels of the group that output channel gz belongs to.
// Weights are indexed as [gz][input channel in group][ky][kx].
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // the dispatch may be rounded up past the output extent
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    float sum;

    // binding 3 only holds real bias values when bias_term == 1
    if (bias_term == 1)
    {
        sum = bias_data[gz];
    }
    else
    {
        sum = 0.f;
    }

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // group deconvolution
    const int channels_g = p.c / group;
    const int num_output_g = p.outc / group;

    // group id
    const int gg = gz / num_output_g;

    // first weight of output channel gz, first input channel of group gg
    int w_offset_0 = gz * channels_g * kernel_w * kernel_h;
    int v_offset_0 = gg * channels_g * p.cstep;

    for (int y = 0; y < kernel_h; y++)
    {
        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));

        // GLSL leaves integer % undefined when an operand is negative, so
        // reject negative positions before taking the remainder. A negative
        // sys can never map to a valid input row anyway.
        if (sys < 0 || sys % stride_h != 0)
            continue;

        // sys >= 0 here, so only the upper bound needs checking
        int sy = sys / stride_h;
        if (sy >= p.h)
            continue;

        for (int x = 0; x < kernel_w; x++)
        {
            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));

            // same negative-operand guard as for rows
            if (sxs < 0 || sxs % stride_w != 0)
                continue;

            int sx = sxs / stride_w;
            if (sx >= p.w)
                continue;

            int v_offset = v_offset_0 + sy * p.w + sx;
            int w_offset = w_offset_0 + y * kernel_w + x;

            // accumulate over the input channels of this group
            for (int z = 0; z < channels_g; z++)
            {
                sum += weight_data[w_offset] * bottom_blob_data[v_offset];

                v_offset += p.cstep;
                w_offset += kernel_w * kernel_h;
            }
        }
    }

    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
}

+ 121
- 0
src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp View File

@@ -0,0 +1,121 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Specialization constants. The host code in this commit
// (DeconvolutionDepthWise::create_pipeline) fills ids 0-7, in this order, with
// kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, bias_term and
// group. The initializers below are only the defaults used when an id is not
// specialized.
layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int group = 1;

// Workgroup dimensions are taken from specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Storage buffers in host binding order: input blob, output blob, weights, bias.
// pack1to4 variant: the input is read as scalars, while output, weights and bias
// are vec4 arrays (four output channels per element).
// NOTE(review): when bias_term is 0 the host binds the weight buffer to slot 3 as
// a placeholder, so bias_data must only be read when bias_term == 1.
layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { vec4 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { vec4 bias_data[]; };

// Push constants: shape of the input blob (dims, w, h, c, cstep) followed by the
// shape of the output blob (outdims, outw, outh, outc, outcstep). The host copies
// them from the matching VkMat fields in exactly this order. The shader uses
// cstep/outcstep as strides into the arrays declared above, i.e. in units of
// each buffer's element type.
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Group deconvolution, scalar input -> vec4 output (pack1to4).
//
// Each invocation produces one vec4 output element (gx, gy, gz), i.e. four
// output channels at once. For every contributing input pixel it multiplies
// the scalar input by a vec4 of weights (one weight per packed output channel).
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // the dispatch may be rounded up past the output extent
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    vec4 sum;

    // binding 3 only holds real bias values when bias_term == 1
    if (bias_term == 1)
    {
        sum = bias_data[gz];
    }
    else
    {
        sum = vec4(0.f);
    }

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // group deconvolution
    const int channels_g = p.c / group;
    const int num_output_g = p.outc / group;

    // group id
    const int gg = gz / num_output_g;

    // first weight of output element gz, first input channel of group gg
    int w_offset_0 = gz * channels_g * kernel_w * kernel_h;
    int v_offset_0 = gg * channels_g * p.cstep;

    for (int y = 0; y < kernel_h; y++)
    {
        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));

        // GLSL leaves integer % undefined when an operand is negative, so
        // reject negative positions before taking the remainder. A negative
        // sys can never map to a valid input row anyway.
        if (sys < 0 || sys % stride_h != 0)
            continue;

        // sys >= 0 here, so only the upper bound needs checking
        int sy = sys / stride_h;
        if (sy >= p.h)
            continue;

        for (int x = 0; x < kernel_w; x++)
        {
            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));

            // same negative-operand guard as for rows
            if (sxs < 0 || sxs % stride_w != 0)
                continue;

            int sx = sxs / stride_w;
            if (sx >= p.w)
                continue;

            int v_offset = v_offset_0 + sy * p.w + sx;
            int w_offset = w_offset_0 + y * kernel_w + x;

            // accumulate over the input channels of this group
            for (int z = 0; z < channels_g; z++)
            {
                float v = bottom_blob_data[v_offset];

                vec4 k = weight_data[w_offset];

                sum += v * k;

                v_offset += p.cstep;
                w_offset += kernel_w * kernel_h;
            }
        }
    }

    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
}

+ 121
- 0
src/layer/shader/deconvolutiondepthwise_group_pack4.comp View File

@@ -0,0 +1,121 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Specialization constants. The host code in this commit
// (DeconvolutionDepthWise::create_pipeline) fills ids 0-7, in this order, with
// kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, bias_term and
// group. The initializers below are only the defaults used when an id is not
// specialized.
layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int group = 1;

// Workgroup dimensions are taken from specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Storage buffers in host binding order: input blob, output blob, weights, bias.
// pack4 variant: input, output and bias are vec4 arrays, and each weight element
// is a mat4 (16 floats) applied to one input vec4.
// NOTE(review): when bias_term is 0 the host binds the weight buffer to slot 3 as
// a placeholder, so bias_data must only be read when bias_term == 1.
layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { mat4 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { vec4 bias_data[]; };

// Push constants: shape of the input blob (dims, w, h, c, cstep) followed by the
// shape of the output blob (outdims, outw, outh, outc, outcstep). The host copies
// them from the matching VkMat fields in exactly this order. The shader uses
// cstep/outcstep as strides into the arrays declared above, i.e. in units of
// each buffer's element type.
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Group deconvolution, vec4 input -> vec4 output (pack4).
//
// Each invocation produces one vec4 output element (gx, gy, gz). For every
// contributing input pixel it multiplies the input vec4 (as a row vector) by a
// mat4 of weights and accumulates the resulting vec4.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // the dispatch may be rounded up past the output extent
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    vec4 sum;

    // binding 3 only holds real bias values when bias_term == 1
    if (bias_term == 1)
    {
        sum = bias_data[gz];
    }
    else
    {
        sum = vec4(0.f);
    }

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // group deconvolution
    const int channels_g = p.c / group;
    const int num_output_g = p.outc / group;

    // group id
    const int gg = gz / num_output_g;

    // first weight of output element gz, first input element of group gg
    int w_offset_0 = gz * channels_g * kernel_w * kernel_h;
    int v_offset_0 = gg * channels_g * p.cstep;

    for (int y = 0; y < kernel_h; y++)
    {
        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));

        // GLSL leaves integer % undefined when an operand is negative, so
        // reject negative positions before taking the remainder. A negative
        // sys can never map to a valid input row anyway.
        if (sys < 0 || sys % stride_h != 0)
            continue;

        // sys >= 0 here, so only the upper bound needs checking
        int sy = sys / stride_h;
        if (sy >= p.h)
            continue;

        for (int x = 0; x < kernel_w; x++)
        {
            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));

            // same negative-operand guard as for rows
            if (sxs < 0 || sxs % stride_w != 0)
                continue;

            int sx = sxs / stride_w;
            if (sx >= p.w)
                continue;

            int v_offset = v_offset_0 + sy * p.w + sx;
            int w_offset = w_offset_0 + y * kernel_w + x;

            // accumulate over the packed input channels of this group
            for (int z = 0; z < channels_g; z++)
            {
                vec4 v = bottom_blob_data[v_offset];

                mat4 k = weight_data[w_offset];

                // row-vector * matrix product
                sum += v * k;

                v_offset += p.cstep;
                w_offset += kernel_w * kernel_h;
            }
        }
    }

    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
}

+ 121
- 0
src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp View File

@@ -0,0 +1,121 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Specialization constants. The host code in this commit
// (DeconvolutionDepthWise::create_pipeline) fills ids 0-7, in this order, with
// kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, bias_term and
// group. The initializers below are only the defaults used when an id is not
// specialized.
layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int group = 1;

// Workgroup dimensions are taken from specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Storage buffers in host binding order: input blob, output blob, weights, bias.
// pack4to1 variant: input and weights are vec4 arrays (four input channels per
// element), while output and bias are plain float arrays.
// NOTE(review): when bias_term is 0 the host binds the weight buffer to slot 3 as
// a placeholder, so bias_data must only be read when bias_term == 1.
layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { vec4 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { float bias_data[]; };

// Push constants: shape of the input blob (dims, w, h, c, cstep) followed by the
// shape of the output blob (outdims, outw, outh, outc, outcstep). The host copies
// them from the matching VkMat fields in exactly this order. The shader uses
// cstep/outcstep as strides into the arrays declared above, i.e. in units of
// each buffer's element type.
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Group deconvolution, vec4 input -> scalar output (pack4to1).
//
// Each invocation produces one scalar output element (gx, gy, gz). For every
// contributing input pixel it takes the dot product of the input vec4 (four
// packed input channels) with a vec4 of weights and accumulates the result.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // the dispatch may be rounded up past the output extent
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    float sum;

    // binding 3 only holds real bias values when bias_term == 1
    if (bias_term == 1)
    {
        sum = bias_data[gz];
    }
    else
    {
        sum = 0.f;
    }

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // group deconvolution
    const int channels_g = p.c / group;
    const int num_output_g = p.outc / group;

    // group id
    const int gg = gz / num_output_g;

    // first weight of output channel gz, first input element of group gg
    int w_offset_0 = gz * channels_g * kernel_w * kernel_h;
    int v_offset_0 = gg * channels_g * p.cstep;

    for (int y = 0; y < kernel_h; y++)
    {
        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));

        // GLSL leaves integer % undefined when an operand is negative, so
        // reject negative positions before taking the remainder. A negative
        // sys can never map to a valid input row anyway.
        if (sys < 0 || sys % stride_h != 0)
            continue;

        // sys >= 0 here, so only the upper bound needs checking
        int sy = sys / stride_h;
        if (sy >= p.h)
            continue;

        for (int x = 0; x < kernel_w; x++)
        {
            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));

            // same negative-operand guard as for rows
            if (sxs < 0 || sxs % stride_w != 0)
                continue;

            int sx = sxs / stride_w;
            if (sx >= p.w)
                continue;

            int v_offset = v_offset_0 + sy * p.w + sx;
            int w_offset = w_offset_0 + y * kernel_w + x;

            // accumulate over the packed input channels of this group
            for (int z = 0; z < channels_g; z++)
            {
                vec4 v = bottom_blob_data[v_offset];

                vec4 k = weight_data[w_offset];

                sum += dot(v, k);

                v_offset += p.cstep;
                w_offset += kernel_w * kernel_h;
            }
        }
    }

    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
}

Loading…
Cancel
Save