group deconvolution packing family

7 years ago · 43c4b57201
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -997,10 +997,10 @@ int ConvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
    VkMat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (packing == 4 && channels_g % 4 != 0)
    {
        ncnn::Option opt_pad = opt;
        opt_pad.blob_vkallocator = opt.workspace_vkallocator;
        ncnn::Option opt_pack1 = opt;
        opt_pack1.blob_vkallocator = opt.workspace_vkallocator;

        packing_pack1->forward(bottom_blob_bordered, bottom_blob_bordered_unpacked, cmd, opt_pad);
        packing_pack1->forward(bottom_blob_bordered, bottom_blob_bordered_unpacked, cmd, opt_pack1);
    }

    VkMat top_blob_unpacked = top_blob;
@@ -1073,10 +1073,7 @@ int ConvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
    // packing
    if (num_output_g % 4 != 0 && out_packing == 4)
    {
        ncnn::Option opt_pad = opt;
        opt_pad.blob_vkallocator = opt.workspace_vkallocator;

        packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt_pad);
        packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt);
    }
    else
    {
--- a/src/layer/deconvolutiondepthwise.cpp
+++ b/src/layer/deconvolutiondepthwise.cpp
@@ -26,18 +26,24 @@ DeconvolutionDepthWise::DeconvolutionDepthWise()
    support_vulkan = true;

 #if NCNN_VULKAN
    packing_pack1 = 0;
    packing_pack4 = 0;

    pipeline_deconvolutiondepthwise = 0;
    pipeline_deconvolutiondepthwise_pack4 = 0;

    pipeline_deconvolutiondepthwise_group = 0;
    pipeline_deconvolutiondepthwise_group_pack4 = 0;
    pipeline_deconvolutiondepthwise_group_pack1to4 = 0;
    pipeline_deconvolutiondepthwise_group_pack4to1 = 0;
 #endif // NCNN_VULKAN
 }

 DeconvolutionDepthWise::~DeconvolutionDepthWise()
 {
 #if NCNN_VULKAN
    for (int i=0; i<(int)deconvolution_group_ops.size(); i++)
        delete deconvolution_group_ops[i];

    deconvolution_group_ops.clear();
    delete packing_pack1;
    delete packing_pack4;
 #endif // NCNN_VULKAN
 }

@@ -56,6 +62,33 @@ int DeconvolutionDepthWise::load_param(const ParamDict& pd)
    weight_data_size = pd.get(6, 0);
    group = pd.get(7, 1);

 #if NCNN_VULKAN
    if (pd.use_vulkan_compute)
    {
        {
        packing_pack1 = ncnn::create_layer(ncnn::LayerType::Packing);
        packing_pack1->vkdev = vkdev;

        ncnn::ParamDict pd;
        pd.set(0, 1);
        pd.use_vulkan_compute = 1;

        packing_pack1->load_param(pd);
        }

        {
        packing_pack4 = ncnn::create_layer(ncnn::LayerType::Packing);
        packing_pack4->vkdev = vkdev;

        ncnn::ParamDict pd;
        pd.set(0, 4);
        pd.use_vulkan_compute = 1;

        packing_pack4->load_param(pd);
        }
    }
 #endif // NCNN_VULKAN

    return 0;
 }

@@ -72,65 +105,6 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb)
            return -100;
    }

 #if NCNN_VULKAN
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // group deconvolution
    if (!(channels == group && group == num_output))
    {
        // create Deconvolution op for each group

        for (int i=0; i<(int)deconvolution_group_ops.size(); i++)
            delete deconvolution_group_ops[i];

        deconvolution_group_ops.clear();

        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

        deconvolution_group_ops.resize(group);

        for (int g=0; g<group; g++)
        {
            Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
            Mat bias_data_g;
            if (bias_term)
                bias_data_g = bias_data.range(num_output_g * g, num_output_g);

            ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
            op->vkdev = vkdev;

            // set param
            ncnn::ParamDict pd;
            pd.set(0, num_output_g);// num_output
            pd.set(1, kernel_w);
            pd.set(11, kernel_h);
            pd.set(2, dilation_w);
            pd.set(12, dilation_h);
            pd.set(3, stride_w);
            pd.set(13, stride_h);
            pd.set(4, 0);// pad_w
            pd.set(14, 0);// pad_h
            pd.set(5, bias_term);
            pd.set(6, maxk * channels_g * num_output_g);// weight_data_size

            pd.use_vulkan_compute = 1;

            op->load_param(pd);

            // set weights
            ncnn::Mat weights[2];
            weights[0] = weight_data_g;
            weights[1] = bias_data_g;

            op->load_model(ModelBinFromMatArray(weights));

            deconvolution_group_ops[g] = op;
        }
    }
 #endif // NCNN_VULKAN

    return 0;
 }

@@ -344,9 +318,210 @@ int DeconvolutionDepthWise::upload_model(VkTransfer& cmd)
        return 0;
    }

    for (int g=0; g<group; g++)
    // group deconvolution
    cmd.record_upload(weight_data_transposed, weight_data_gpu);

    if (bias_term)
    {
        deconvolution_group_ops[g]->upload_model(cmd);
        cmd.record_upload(bias_data, bias_data_gpu);
    }

    const int channels_g = channels / group;
    const int num_output_g = num_output / group;

    // pack4
    if (channels_g % 4 == 0 && num_output_g % 4 == 0)
    {
        // src = kw-kh-inch-outch
        // dst = 4a-4b-kw-kh-inch/4a-outch/4b
        Mat weight_data_pack4_groups;
        {
            Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);

            weight_data_pack4_groups.create(16*maxk, channels_g/4, num_output_g/4 * group);

            for (int g=0; g<group; g++)
            {
                const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);

                Mat weight_data_pack4 = weight_data_pack4_groups.channel_range(num_output_g/4 * g, num_output_g/4);

                for (int q=0; q+3<num_output_g; q+=4)
                {
                    const Mat k0 = weight_data_r2.channel(q);
                    const Mat k1 = weight_data_r2.channel(q+1);
                    const Mat k2 = weight_data_r2.channel(q+2);
                    const Mat k3 = weight_data_r2.channel(q+3);

                    Mat g0 = weight_data_pack4.channel(q/4);

                    for (int p=0; p+3<channels_g; p+=4)
                    {
                        const float* k00 = k0.row(p);
                        const float* k01 = k0.row(p+1);
                        const float* k02 = k0.row(p+2);
                        const float* k03 = k0.row(p+3);

                        const float* k10 = k1.row(p);
                        const float* k11 = k1.row(p+1);
                        const float* k12 = k1.row(p+2);
                        const float* k13 = k1.row(p+3);

                        const float* k20 = k2.row(p);
                        const float* k21 = k2.row(p+1);
                        const float* k22 = k2.row(p+2);
                        const float* k23 = k2.row(p+3);

                        const float* k30 = k3.row(p);
                        const float* k31 = k3.row(p+1);
                        const float* k32 = k3.row(p+2);
                        const float* k33 = k3.row(p+3);

                        float* g00 = g0.row(p/4);

                        for (int k=0; k<maxk; k++)
                        {
                            g00[0] = k00[k];
                            g00[1] = k01[k];
                            g00[2] = k02[k];
                            g00[3] = k03[k];

                            g00[4] = k10[k];
                            g00[5] = k11[k];
                            g00[6] = k12[k];
                            g00[7] = k13[k];

                            g00[8] = k20[k];
                            g00[9] = k21[k];
                            g00[10] = k22[k];
                            g00[11] = k23[k];

                            g00[12] = k30[k];
                            g00[13] = k31[k];
                            g00[14] = k32[k];
                            g00[15] = k33[k];

                            g00 += 16;
                        }
                    }
                }
            }
        }

        weight_data_pack4_groups = weight_data_pack4_groups.reshape(16*maxk * (channels_g/4) * (num_output_g/4) * group);
        cmd.record_upload(weight_data_pack4_groups, weight_data_gpu_pack4);
    }

    // pack1to4
    if (channels_g % 4 != 0 && num_output_g % 4 == 0)
    {
        // src = kw-kh-inch-outch
        // dst = 4b-kw-kh-inch-outch/4b
        Mat weight_data_pack1to4_groups;
        {
            Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);

            weight_data_pack1to4_groups.create(4*maxk, channels_g, num_output_g/4 * group);

            for (int g=0; g<group; g++)
            {
                const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);

                Mat weight_data_pack1to4 = weight_data_pack1to4_groups.channel_range(num_output_g/4 * g, num_output_g/4);

                for (int q=0; q+3<num_output_g; q+=4)
                {
                    const Mat k0 = weight_data_r2.channel(q);
                    const Mat k1 = weight_data_r2.channel(q+1);
                    const Mat k2 = weight_data_r2.channel(q+2);
                    const Mat k3 = weight_data_r2.channel(q+3);

                    Mat g0 = weight_data_pack1to4.channel(q/4);

                    for (int p=0; p<channels_g; p++)
                    {
                        const float* k00 = k0.row(p);
                        const float* k10 = k1.row(p);
                        const float* k20 = k2.row(p);
                        const float* k30 = k3.row(p);

                        float* g00 = g0.row(p);

                        for (int k=0; k<maxk; k++)
                        {
                            g00[0] = k00[k];
                            g00[1] = k10[k];
                            g00[2] = k20[k];
                            g00[3] = k30[k];

                            g00 += 4;
                        }
                    }
                }
            }
        }

        weight_data_pack1to4_groups = weight_data_pack1to4_groups.reshape(4*maxk * channels_g * (num_output_g/4) * group);
        cmd.record_upload(weight_data_pack1to4_groups, weight_data_gpu_pack1to4);
    }

    // pack4to1
    if (channels_g % 4 == 0 && num_output_g % 4 != 0)
    {
        // src = kw-kh-inch-outch
        // dst = 4a-kw-kh-inch/4a-outch
        Mat weight_data_pack4to1_groups;
        {
            Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);

            weight_data_pack4to1_groups.create(4*maxk, channels_g/4, num_output_g * group);

            for (int g=0; g<group; g++)
            {
                const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);

                Mat weight_data_pack4to1 = weight_data_pack4to1_groups.channel_range(num_output_g * g, num_output_g);

                for (int q=0; q<num_output_g; q++)
                {
                    const Mat k0 = weight_data_r2.channel(q);
                    Mat g0 = weight_data_pack4to1.channel(q);

                    for (int p=0; p+3<channels_g; p+=4)
                    {
                        const float* k00 = k0.row(p);
                        const float* k01 = k0.row(p+1);
                        const float* k02 = k0.row(p+2);
                        const float* k03 = k0.row(p+3);

                        float* g00 = g0.row(p/4);

                        for (int k=0; k<maxk; k++)
                        {
                            g00[0] = k00[k];
                            g00[1] = k01[k];
                            g00[2] = k02[k];
                            g00[3] = k03[k];

                            g00 += 4;
                        }
                    }
                }
            }
        }

        weight_data_pack4to1_groups = weight_data_pack4to1_groups.reshape(4*maxk * (channels_g/4) * num_output_g * group);
        cmd.record_upload(weight_data_pack4to1_groups, weight_data_gpu_pack4to1);
    }

    if (num_output_g % 4 == 0)
    {
        if (bias_term)
        {
            Mat bias_data_pack4;
            convert_packing(bias_data, bias_data_pack4, 4);
            cmd.record_upload(bias_data_pack4, bias_data_gpu_pack4);
        }
    }

    return 0;
@@ -386,9 +561,57 @@ int DeconvolutionDepthWise::create_pipeline()
        return 0;
    }

    for (int g=0; g<group; g++)
    const int channels_g = channels / group;
    const int num_output_g = num_output / group;

    // group deconvolution
    pipeline_deconvolutiondepthwise_group = new Pipeline(vkdev);
    pipeline_deconvolutiondepthwise_group->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));

    std::vector<vk_specialization_type> specializations(8);
    specializations[0].i = kernel_w;
    specializations[1].i = kernel_h;
    specializations[2].i = dilation_w;
    specializations[3].i = dilation_h;
    specializations[4].i = stride_w;
    specializations[5].i = stride_h;
    specializations[6].i = bias_term;
    specializations[7].i = group;

    pipeline_deconvolutiondepthwise_group->create("deconvolutiondepthwise_group", specializations, 4, 10);

    // pack4
    if (channels_g % 4 == 0 && num_output_g % 4 == 0)
    {
        pipeline_deconvolutiondepthwise_group_pack4 = new Pipeline(vkdev);
        pipeline_deconvolutiondepthwise_group_pack4->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
        pipeline_deconvolutiondepthwise_group_pack4->create("deconvolutiondepthwise_group_pack4", specializations, 4, 10);
    }

    // pack1to4
    if (channels_g % 4 != 0 && num_output_g % 4 == 0)
    {
        pipeline_deconvolutiondepthwise_group_pack1to4 = new Pipeline(vkdev);
        pipeline_deconvolutiondepthwise_group_pack1to4->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
        pipeline_deconvolutiondepthwise_group_pack1to4->create("deconvolutiondepthwise_group_pack1to4", specializations, 4, 10);
    }

    // pack4to1
    if (channels_g % 4 == 0 && num_output_g % 4 != 0)
    {
        pipeline_deconvolutiondepthwise_group_pack4to1 = new Pipeline(vkdev);
        pipeline_deconvolutiondepthwise_group_pack4to1->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
        pipeline_deconvolutiondepthwise_group_pack4to1->create("deconvolutiondepthwise_group_pack4to1", specializations, 4, 10);
    }

    if (channels % 4 == 0 && channels_g % 4 != 0)
    {
        deconvolution_group_ops[g]->create_pipeline();
        packing_pack1->create_pipeline();
    }

    if (num_output_g % 4 != 0 && num_output % 4 == 0)
    {
        packing_pack4->create_pipeline();
    }

    return 0;
@@ -396,10 +619,11 @@ int DeconvolutionDepthWise::create_pipeline()

 int DeconvolutionDepthWise::destroy_pipeline()
 {
    for (int g=0; g<(int)deconvolution_group_ops.size(); g++)
    {
        deconvolution_group_ops[g]->destroy_pipeline();
    }
    if (packing_pack1)
        packing_pack1->destroy_pipeline();

    if (packing_pack4)
        packing_pack4->destroy_pipeline();

    delete pipeline_deconvolutiondepthwise;
    pipeline_deconvolutiondepthwise = 0;
@@ -407,6 +631,18 @@ int DeconvolutionDepthWise::destroy_pipeline()
    delete pipeline_deconvolutiondepthwise_pack4;
    pipeline_deconvolutiondepthwise_pack4 = 0;

    delete pipeline_deconvolutiondepthwise_group;
    pipeline_deconvolutiondepthwise_group = 0;

    delete pipeline_deconvolutiondepthwise_group_pack4;
    pipeline_deconvolutiondepthwise_group_pack4 = 0;

    delete pipeline_deconvolutiondepthwise_group_pack1to4;
    pipeline_deconvolutiondepthwise_group_pack1to4 = 0;

    delete pipeline_deconvolutiondepthwise_group_pack4to1;
    pipeline_deconvolutiondepthwise_group_pack4to1 = 0;

    return 0;
 }

@@ -423,10 +659,10 @@ int DeconvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, V

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;
    int out_packing = num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / packing * out_packing;

    // TODO assert num_output % packing == 0

    top_blob.create(outw, outh, num_output / packing, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator);
    top_blob.create(outw, outh, num_output / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
    if (top_blob.empty())
        return -100;

@@ -463,25 +699,94 @@ int DeconvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, V
        return 0;
    }

    // record
    cmd.record_prepare_compute_barrier(top_blob);
    const int channels_g = channels * packing / group;
    const int num_output_g = num_output / group;

    const int channels_g = channels / group;
    const int num_output_g = num_output / packing / group;
    // unpacking
    VkMat bottom_blob_unpacked = bottom_blob;
    if (packing == 4 && channels_g % 4 != 0)
    {
        ncnn::Option opt_pack1 = opt;
        opt_pack1.blob_vkallocator = opt.workspace_vkallocator;

        packing_pack1->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1);
    }

    VkMat top_blob_unpacked = top_blob;
    if (num_output_g % 4 != 0 && out_packing == 4)
    {
        top_blob_unpacked.create(outw, outh, num_output, elemsize / packing, 1, opt.workspace_vkallocator, opt.staging_vkallocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    for (int g=0; g<group; g++)
    std::vector<VkMat> bindings(4);
    bindings[0] = bottom_blob_unpacked;
    bindings[1] = top_blob_unpacked;
    if (channels_g % 4 != 0 && num_output_g % 4 != 0)
    {
        bindings[2] = weight_data_gpu;
        bindings[3] = bias_term ? bias_data_gpu : weight_data_gpu;// TODO use dummy buffer
    }
    else if (channels_g % 4 == 0 && num_output_g % 4 == 0)
    {
        bindings[2] = weight_data_gpu_pack4;
        bindings[3] = bias_term ? bias_data_gpu_pack4 : weight_data_gpu_pack4;// TODO use dummy buffer
    }
    else if (channels_g % 4 != 0 && num_output_g % 4 == 0)
    {
        VkMat bottom_blob_bordered_g = bottom_blob.channel_range(channels_g * g, channels_g);
        VkMat top_blob_g = top_blob.channel_range(num_output_g * g, num_output_g);
        bindings[2] = weight_data_gpu_pack1to4;
        bindings[3] = bias_term ? bias_data_gpu_pack4 : weight_data_gpu_pack1to4;// TODO use dummy buffer
    }
    else if (channels_g % 4 == 0 && num_output_g % 4 != 0)
    {
        bindings[2] = weight_data_gpu_pack4to1;
        bindings[3] = bias_term ? bias_data_gpu : weight_data_gpu_pack4to1;// TODO use dummy buffer
    }

        const ncnn::Layer* op = deconvolution_group_ops[g];
    std::vector<vk_constant_type> constants(10);
    constants[0].i = bottom_blob_unpacked.dims;
    constants[1].i = bottom_blob_unpacked.w;
    constants[2].i = bottom_blob_unpacked.h;
    constants[3].i = bottom_blob_unpacked.c;
    constants[4].i = bottom_blob_unpacked.cstep;
    constants[5].i = top_blob_unpacked.dims;
    constants[6].i = top_blob_unpacked.w;
    constants[7].i = top_blob_unpacked.h;
    constants[8].i = top_blob_unpacked.c;
    constants[9].i = top_blob_unpacked.cstep;

    const Pipeline* pipeline = 0;
    if (channels_g % 4 != 0 && num_output_g % 4 != 0)
    {
        pipeline = pipeline_deconvolutiondepthwise_group;
    }
    else if (channels_g % 4 == 0 && num_output_g % 4 == 0)
    {
        pipeline = pipeline_deconvolutiondepthwise_group_pack4;
    }
    else if (channels_g % 4 != 0 && num_output_g % 4 == 0)
    {
        pipeline = pipeline_deconvolutiondepthwise_group_pack1to4;
    }
    else if (channels_g % 4 == 0 && num_output_g % 4 != 0)
    {
        pipeline = pipeline_deconvolutiondepthwise_group_pack4to1;
    }

        ncnn::Option opt_g = opt;
        opt_g.blob_vkallocator = top_blob.allocator;
        opt_g.staging_vkallocator = top_blob.staging_allocator;
    // record
    cmd.record_prepare_compute_barrier(bottom_blob_unpacked);
    cmd.record_prepare_compute_barrier(top_blob_unpacked);
    cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);

        // forward
        op->forward(bottom_blob_bordered_g, top_blob_g, cmd, opt_g);
    // packing
    if (num_output_g % 4 != 0 && out_packing == 4)
    {
        packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt);
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
--- a/src/layer/deconvolutiondepthwise.h
+++ b/src/layer/deconvolutiondepthwise.h
@@ -64,13 +64,27 @@ public:
    VkMat weight_data_gpu;
    VkMat bias_data_gpu;

    std::vector<ncnn::Layer*> deconvolution_group_ops;
    ncnn::Layer* packing_pack1;
    ncnn::Layer* packing_pack4;

    Pipeline* pipeline_deconvolutiondepthwise;

    // pack4
    VkMat weight_data_gpu_pack4;
    VkMat bias_data_gpu_pack4;

    Pipeline* pipeline_deconvolutiondepthwise_pack4;

    Pipeline* pipeline_deconvolutiondepthwise_group;
    Pipeline* pipeline_deconvolutiondepthwise_group_pack4;

    // pack1to4
    VkMat weight_data_gpu_pack1to4;
    Pipeline* pipeline_deconvolutiondepthwise_group_pack1to4;

    // pack4to1
    VkMat weight_data_gpu_pack4to1;
    Pipeline* pipeline_deconvolutiondepthwise_group_pack4to1;
 #endif // NCNN_VULKAN

 };
--- a/src/layer/shader/deconvolutiondepthwise_group.comp
+++ b/src/layer/shader/deconvolutiondepthwise_group.comp
@@ -0,0 +1,117 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Specialization constants. The host fills ids 0..7 in this order with
 // kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, bias_term
 // and group; the initializers below are the defaults used when an id is not
 // specialized.
 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int group = 1;

 // Workgroup size comes from specialization ids 233..235.
 // NOTE(review): presumably supplied by the host Pipeline from its
 // set_optimal_local_size_xyz() call - confirm in the Pipeline implementation.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // Storage buffers, all scalar float in this variant:
 //   0 - input blob, 1 - output blob, 2 - weights, 3 - bias.
 // The host binds the weight buffer to slot 3 as a placeholder when the layer
 // has no bias; main() only reads bias_data when bias_term == 1.
 layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
 layout (binding = 2) readonly buffer weight_blob { float weight_data[]; };
 layout (binding = 3) readonly buffer bias_blob { float bias_data[]; };

 // Push constants: dims / w / h / c / cstep of the input blob followed by the
 // same five fields of the output blob, in the order the host stores them.
 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Computes one output element (gx, gy, gz) of a grouped deconvolution over
 // scalar float input and output buffers.
 //
 // For every kernel tap (x, y) the invocation locates the single input pixel
 // that reaches this output position through that tap, skips taps that fall
 // between strides or outside the input, and accumulates weight * input over
 // the channels_g input channels belonging to the output channel's group.
 // The result is written to top_blob_data[gz * outcstep + gy * outw + gx].
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // The dispatch may be rounded up past the output extent.
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    float sum;

    if (bias_term == 1)
    {
        sum = bias_data[gz];
    }
    else
    {
        sum = 0.f;
    }

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
    const int maxk = kernel_w * kernel_h;

    // group convolution
    const int channels_g = p.c / group;
    const int num_output_g = p.outc / group;

    // group id
    const int gg = gz / num_output_g;

    // First weight of output channel gz and first input channel of its group.
    int w_offset_0 = gz * channels_g * maxk;
    int v_offset_0 = gg * channels_g * p.cstep;

    for (int y = 0; y < kernel_h; y++)
    {
        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));

        // sys is negative near the top border. GLSL leaves integer % undefined
        // when an operand is negative, so reject those values before taking
        // the remainder instead of relying on the later bounds check.
        if (sys < 0 || sys % stride_h != 0)
            continue;

        int sy = sys / stride_h;
        if (sy >= p.h)
            continue;

        for (int x = 0; x < kernel_w; x++)
        {
            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));

            // Same guard as for sys: never feed a negative value to %.
            if (sxs < 0 || sxs % stride_w != 0)
                continue;

            int sx = sxs / stride_w;
            if (sx >= p.w)
                continue;

            int v_offset = v_offset_0 + sy * p.w + sx;
            int w_offset = w_offset_0 + y * kernel_w + x;

            for (int z = 0; z < channels_g; z++)
            {
                sum += weight_data[w_offset] * bottom_blob_data[v_offset];

                // Advance to the same pixel / tap of the next input channel.
                v_offset += p.cstep;
                w_offset += maxk;
            }
        }
    }

    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
 }
--- a/src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp
+++ b/src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp
@@ -0,0 +1,121 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Specialization constants. The host fills ids 0..7 in this order with
 // kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, bias_term
 // and group; the initializers below are the defaults used when an id is not
 // specialized.
 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int group = 1;

 // Workgroup size comes from specialization ids 233..235.
 // NOTE(review): presumably supplied by the host Pipeline from its
 // set_optimal_local_size_xyz() call - confirm in the Pipeline implementation.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // Storage buffers for the pack1to4 variant:
 //   0 - input blob, scalar float elements
 //   1 - output blob, vec4 elements
 //   2 - weights, one vec4 per (input channel, kernel tap)
 //   3 - bias, vec4 elements
 // The host binds the weight buffer to slot 3 as a placeholder when the layer
 // has no bias; main() only reads bias_data when bias_term == 1.
 layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
 layout (binding = 2) readonly buffer weight_blob { vec4 weight_data[]; };
 layout (binding = 3) readonly buffer bias_blob { vec4 bias_data[]; };

 // Push constants: dims / w / h / c / cstep of the input blob followed by the
 // same five fields of the output blob, in the order the host stores them.
 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Computes one vec4 output element (gx, gy, gz) of a grouped deconvolution
 // whose input buffer holds scalar floats and whose output buffer holds vec4s.
 //
 // For every kernel tap (x, y) the invocation locates the single input pixel
 // that reaches this output position through that tap, skips taps that fall
 // between strides or outside the input, and accumulates input * weight, where
 // each weight is a vec4 scaled by the scalar input value. The loop runs over
 // the channels_g input channels belonging to the output element's group.
 // The result is written to top_blob_data[gz * outcstep + gy * outw + gx].
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // The dispatch may be rounded up past the output extent.
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    vec4 sum;

    if (bias_term == 1)
    {
        sum = bias_data[gz];
    }
    else
    {
        sum = vec4(0.f);
    }

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
    const int maxk = kernel_w * kernel_h;

    // group convolution
    const int channels_g = p.c / group;
    const int num_output_g = p.outc / group;

    // group id
    const int gg = gz / num_output_g;

    // First weight of output element gz and first input channel of its group.
    int w_offset_0 = gz * channels_g * maxk;
    int v_offset_0 = gg * channels_g * p.cstep;

    for (int y = 0; y < kernel_h; y++)
    {
        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));

        // sys is negative near the top border. GLSL leaves integer % undefined
        // when an operand is negative, so reject those values before taking
        // the remainder instead of relying on the later bounds check.
        if (sys < 0 || sys % stride_h != 0)
            continue;

        int sy = sys / stride_h;
        if (sy >= p.h)
            continue;

        for (int x = 0; x < kernel_w; x++)
        {
            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));

            // Same guard as for sys: never feed a negative value to %.
            if (sxs < 0 || sxs % stride_w != 0)
                continue;

            int sx = sxs / stride_w;
            if (sx >= p.w)
                continue;

            int v_offset = v_offset_0 + sy * p.w + sx;
            int w_offset = w_offset_0 + y * kernel_w + x;

            for (int z = 0; z < channels_g; z++)
            {
                float v = bottom_blob_data[v_offset];

                vec4 k = weight_data[w_offset];

                sum += v * k;

                // Advance to the same pixel / tap of the next input channel.
                v_offset += p.cstep;
                w_offset += maxk;
            }
        }
    }

    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
 }
--- a/src/layer/shader/deconvolutiondepthwise_group_pack4.comp
+++ b/src/layer/shader/deconvolutiondepthwise_group_pack4.comp
@@ -0,0 +1,121 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Specialization constants. The host fills ids 0..7 in this order with
 // kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, bias_term
 // and group; the initializers below are the defaults used when an id is not
 // specialized.
 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int group = 1;

 // Workgroup size comes from specialization ids 233..235.
 // NOTE(review): presumably supplied by the host Pipeline from its
 // set_optimal_local_size_xyz() call - confirm in the Pipeline implementation.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // Storage buffers for the pack4 variant:
 //   0 - input blob, vec4 elements
 //   1 - output blob, vec4 elements
 //   2 - weights, one mat4 per (input element, kernel tap)
 //   3 - bias, vec4 elements
 // The host binds the weight buffer to slot 3 as a placeholder when the layer
 // has no bias; main() only reads bias_data when bias_term == 1.
 layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
 layout (binding = 2) readonly buffer weight_blob { mat4 weight_data[]; };
 layout (binding = 3) readonly buffer bias_blob { vec4 bias_data[]; };

 // Push constants: dims / w / h / c / cstep of the input blob followed by the
 // same five fields of the output blob, in the order the host stores them.
 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Computes one vec4 output element (gx, gy, gz) of a grouped deconvolution
 // whose input and output buffers both hold vec4 elements.
 //
 // For every kernel tap (x, y) the invocation locates the single input pixel
 // that reaches this output position through that tap, skips taps that fall
 // between strides or outside the input, and accumulates v * k, where v is the
 // input vec4 and k the mat4 weight for that tap. The loop runs over the
 // channels_g input elements belonging to the output element's group.
 // The result is written to top_blob_data[gz * outcstep + gy * outw + gx].
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // The dispatch may be rounded up past the output extent.
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    vec4 sum;

    if (bias_term == 1)
    {
        sum = bias_data[gz];
    }
    else
    {
        sum = vec4(0.f);
    }

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
    const int maxk = kernel_w * kernel_h;

    // group convolution
    const int channels_g = p.c / group;
    const int num_output_g = p.outc / group;

    // group id
    const int gg = gz / num_output_g;

    // First weight of output element gz and first input element of its group.
    int w_offset_0 = gz * channels_g * maxk;
    int v_offset_0 = gg * channels_g * p.cstep;

    for (int y = 0; y < kernel_h; y++)
    {
        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));

        // sys is negative near the top border. GLSL leaves integer % undefined
        // when an operand is negative, so reject those values before taking
        // the remainder instead of relying on the later bounds check.
        if (sys < 0 || sys % stride_h != 0)
            continue;

        int sy = sys / stride_h;
        if (sy >= p.h)
            continue;

        for (int x = 0; x < kernel_w; x++)
        {
            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));

            // Same guard as for sys: never feed a negative value to %.
            if (sxs < 0 || sxs % stride_w != 0)
                continue;

            int sx = sxs / stride_w;
            if (sx >= p.w)
                continue;

            int v_offset = v_offset_0 + sy * p.w + sx;
            int w_offset = w_offset_0 + y * kernel_w + x;

            for (int z = 0; z < channels_g; z++)
            {
                vec4 v = bottom_blob_data[v_offset];

                mat4 k = weight_data[w_offset];

                // Row-vector times matrix: component j is dot(v, k[j]).
                sum += v * k;

                // Advance to the same pixel / tap of the next input element.
                v_offset += p.cstep;
                w_offset += maxk;
            }
        }
    }

    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
 }
--- a/src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp
+++ b/src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp
@@ -0,0 +1,121 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Specialization constants (ids 0-7). The values written here are the
 // defaults used when a constant is not specialized.
 // kernel_w / kernel_h     : loop bounds over the kernel taps in main().
 // dilation_w / dilation_h : spacing between kernel taps.
 // stride_w / stride_h     : divisor mapping output positions to input positions.
 // bias_term               : 1 -> the accumulator starts from bias_data[gz], else from 0.
 // group                   : divides p.c and p.outc into per-group channel counts.
 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int group = 1;

 // Workgroup size is taken from specialization constants 233 / 234 / 235.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // Storage buffers. Input and weights are read as vec4 elements; output and
 // bias are scalar floats (main() reduces each vec4 pair with dot()).
 layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
 layout (binding = 2) readonly buffer weight_blob { vec4 weight_data[]; };
 layout (binding = 3) readonly buffer bias_blob { float bias_data[]; };

 // Push constants describing the input blob (first five members) and the
 // output blob (last five members).
 layout (push_constant) uniform parameter
 {
    // Not read by main() in this shader.
    int dims;
    // Input width and height; used for bounds checks and row addressing.
    int w;
    int h;
    // Input channel count in vec4 elements; divided by group in main().
    int c;
    // Element stride between consecutive input channels.
    int cstep;

    // Not read by main() in this shader.
    int outdims;
    // Output extent; invocations outside it return immediately.
    int outw;
    int outh;
    int outc;
    // Element stride between consecutive output channels.
    int outcstep;
 } p;

 // Computes one output element of the grouped deconvolution, reading vec4
 // input elements and writing a scalar float output.
 //
 // Invocation (gx, gy, gz) produces the float written to
 // top_blob_data[gz * p.outcstep + gy * p.outw + gx]. The value is gathered:
 // for every kernel tap (x, y) the matching input position is derived from the
 // output position, and taps that do not land on an input sample are skipped.
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Invocations outside the output extent have nothing to write.
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    // The accumulator starts from the bias of output channel gz when enabled.
    float sum;

    if (bias_term == 1)
    {
        sum = bias_data[gz];
    }
    else
    {
        sum = 0.f;
    }

    // Extent covered by the dilated kernel along each axis.
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // group convolution: channel counts handled by a single group
    const int channels_g = p.c / group;
    const int num_output_g = p.outc / group;

    // group id
    const int gg = gz / num_output_g;

    // Weights of output channel gz: one kernel_w * kernel_h plane per input
    // channel of the group (see the w_offset stride in the inner loop).
    int w_offset_0 = gz * channels_g * kernel_w * kernel_h;
    // First input channel belonging to group gg.
    int v_offset_0 = gg * channels_g * p.cstep;

    for (int y = 0; y < kernel_h; y++)
    {
        // Candidate input row scaled by stride_h; only exact non-negative
        // multiples of stride_h correspond to an input row.
        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));

        // Reject negative positions BEFORE taking the modulus: integer % with
        // a negative operand has undefined results in GLSL, and a negative
        // position can never map to a valid input row anyway.
        if (sys < 0 || sys % stride_h != 0)
            continue;

        int sy = sys / stride_h;
        if (sy < 0 || sy >= p.h)
            continue;

        for (int x = 0; x < kernel_w; x++)
        {
            // Same mapping along the x axis.
            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));

            // Negative positions are filtered first for the same reason as above.
            if (sxs < 0 || sxs % stride_w != 0)
                continue;

            int sx = sxs / stride_w;
            if (sx < 0 || sx >= p.w)
                continue;

            int v_offset = v_offset_0 + sy * p.w + sx;
            int w_offset = w_offset_0 + y * kernel_w + x;

            // Accumulate this tap over every input channel of the group.
            for (int z = 0; z < channels_g; z++)
            {
                vec4 v = bottom_blob_data[v_offset];

                vec4 k = weight_data[w_offset];

                // The four lanes of the input element collapse into one scalar.
                sum += dot(v, k);

                // Advance to the same tap of the next input channel.
                v_offset += p.cstep;
                w_offset += kernel_w * kernel_h;
            }
        }
    }

    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
 }