From 43c4b57201810c9c8aef5055f07498bb5cc1f735 Mon Sep 17 00:00:00 2001
From: nihuini <nihuini@tencent.com>
Date: Fri, 22 Feb 2019 19:46:00 +0800
Subject: [PATCH] group deconvolution packing family

---
 src/layer/convolutiondepthwise.cpp            |  11 +-
 src/layer/deconvolutiondepthwise.cpp          | 479 ++++++++++++++----
 src/layer/deconvolutiondepthwise.h            |  16 +-
 .../shader/deconvolutiondepthwise_group.comp  | 117 +++++
 ...deconvolutiondepthwise_group_pack1to4.comp | 121 +++++
 .../deconvolutiondepthwise_group_pack4.comp   | 121 +++++
 ...deconvolutiondepthwise_group_pack4to1.comp | 121 +++++
 7 files changed, 891 insertions(+), 95 deletions(-)
 create mode 100644 src/layer/shader/deconvolutiondepthwise_group.comp
 create mode 100644 src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp
 create mode 100644 src/layer/shader/deconvolutiondepthwise_group_pack4.comp
 create mode 100644 src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp

diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp
index 47ab4249a..ff5bddf3d 100644
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -997,10 +997,10 @@ int ConvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
     VkMat bottom_blob_bordered_unpacked = bottom_blob_bordered;
     if (packing == 4 && channels_g % 4 != 0)
     {
-        ncnn::Option opt_pad = opt;
-        opt_pad.blob_vkallocator = opt.workspace_vkallocator;
+        ncnn::Option opt_pack1 = opt;
+        opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
 
-        packing_pack1->forward(bottom_blob_bordered, bottom_blob_bordered_unpacked, cmd, opt_pad);
+        packing_pack1->forward(bottom_blob_bordered, bottom_blob_bordered_unpacked, cmd, opt_pack1);
     }
 
     VkMat top_blob_unpacked = top_blob;
@@ -1073,10 +1073,7 @@ int ConvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
     // packing
     if (num_output_g % 4 != 0 && out_packing == 4)
     {
-        ncnn::Option opt_pad = opt;
-        opt_pad.blob_vkallocator = opt.workspace_vkallocator;
-
-        packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt_pad);
+        packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt);
     }
     else
     {
diff --git a/src/layer/deconvolutiondepthwise.cpp b/src/layer/deconvolutiondepthwise.cpp
index 19773c1a4..b7c76abb0 100644
--- a/src/layer/deconvolutiondepthwise.cpp
+++ b/src/layer/deconvolutiondepthwise.cpp
@@ -26,18 +26,24 @@ DeconvolutionDepthWise::DeconvolutionDepthWise()
     support_vulkan = true;
 
 #if NCNN_VULKAN
+    packing_pack1 = 0;
+    packing_pack4 = 0;
+
     pipeline_deconvolutiondepthwise = 0;
     pipeline_deconvolutiondepthwise_pack4 = 0;
+
+    pipeline_deconvolutiondepthwise_group = 0;
+    pipeline_deconvolutiondepthwise_group_pack4 = 0;
+    pipeline_deconvolutiondepthwise_group_pack1to4 = 0;
+    pipeline_deconvolutiondepthwise_group_pack4to1 = 0;
 #endif // NCNN_VULKAN
 }
 
 DeconvolutionDepthWise::~DeconvolutionDepthWise()
 {
 #if NCNN_VULKAN
-    for (int i=0; i<(int)deconvolution_group_ops.size(); i++)
-        delete deconvolution_group_ops[i];
-
-    deconvolution_group_ops.clear();
+    delete packing_pack1; // helper layer created in load_param; still 0 (set by the constructor) if never created, which delete accepts
+    delete packing_pack4; // same ownership rule as packing_pack1
 #endif // NCNN_VULKAN
 }
 
@@ -56,6 +62,33 @@ int DeconvolutionDepthWise::load_param(const ParamDict& pd)
     weight_data_size = pd.get(6, 0);
     group = pd.get(7, 1);
 
+#if NCNN_VULKAN
+    if (pd.use_vulkan_compute) // the packing helpers are only created for the Vulkan path
+    {
+        { // helper used by forward() to convert a pack4 input into an unpacked blob
+        packing_pack1 = ncnn::create_layer(ncnn::LayerType::Packing);
+        packing_pack1->vkdev = vkdev;
+
+        ncnn::ParamDict pd_pack1; // distinct name so the load_param parameter pd is not shadowed
+        pd_pack1.set(0, 1); // param 0 = 1, presumably the target packing - confirm against the Packing layer
+        pd_pack1.use_vulkan_compute = 1;
+
+        packing_pack1->load_param(pd_pack1);
+        }
+
+        { // helper used by forward() to convert the unpacked result into a pack4 top blob
+        packing_pack4 = ncnn::create_layer(ncnn::LayerType::Packing);
+        packing_pack4->vkdev = vkdev;
+
+        ncnn::ParamDict pd_pack4; // distinct name so the load_param parameter pd is not shadowed
+        pd_pack4.set(0, 4); // param 0 = 4, presumably the target packing - confirm against the Packing layer
+        pd_pack4.use_vulkan_compute = 1;
+
+        packing_pack4->load_param(pd_pack4);
+        }
+    }
+#endif // NCNN_VULKAN
+
     return 0;
 }
 
@@ -72,65 +105,6 @@ int DeconvolutionDepthWise::load_model(const ModelBin& mb)
             return -100;
     }
 
-#if NCNN_VULKAN
-    const int maxk = kernel_w * kernel_h;
-    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
-
-    // group deconvolution
-    if (!(channels == group && group == num_output))
-    {
-        // create Deconvolution op for each group
-
-        for (int i=0; i<(int)deconvolution_group_ops.size(); i++)
-            delete deconvolution_group_ops[i];
-
-        deconvolution_group_ops.clear();
-
-        const int channels_g = channels / group;
-        const int num_output_g = num_output / group;
-
-        deconvolution_group_ops.resize(group);
-
-        for (int g=0; g<group; g++)
-        {
-            Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
-            Mat bias_data_g;
-            if (bias_term)
-                bias_data_g = bias_data.range(num_output_g * g, num_output_g);
-
-            ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
-            op->vkdev = vkdev;
-
-            // set param
-            ncnn::ParamDict pd;
-            pd.set(0, num_output_g);// num_output
-            pd.set(1, kernel_w);
-            pd.set(11, kernel_h);
-            pd.set(2, dilation_w);
-            pd.set(12, dilation_h);
-            pd.set(3, stride_w);
-            pd.set(13, stride_h);
-            pd.set(4, 0);// pad_w
-            pd.set(14, 0);// pad_h
-            pd.set(5, bias_term);
-            pd.set(6, maxk * channels_g * num_output_g);// weight_data_size
-
-            pd.use_vulkan_compute = 1;
-
-            op->load_param(pd);
-
-            // set weights
-            ncnn::Mat weights[2];
-            weights[0] = weight_data_g;
-            weights[1] = bias_data_g;
-
-            op->load_model(ModelBinFromMatArray(weights));
-
-            deconvolution_group_ops[g] = op;
-        }
-    }
-#endif // NCNN_VULKAN
-
     return 0;
 }
 
@@ -344,9 +318,210 @@ int DeconvolutionDepthWise::upload_model(VkTransfer& cmd)
         return 0;
     }
 
-    for (int g=0; g<group; g++)
+    // group deconvolution
+    cmd.record_upload(weight_data_transposed, weight_data_gpu);
+
+    if (bias_term)
     {
-        deconvolution_group_ops[g]->upload_model(cmd);
+        cmd.record_upload(bias_data, bias_data_gpu);
+    }
+
+    const int channels_g = channels / group;
+    const int num_output_g = num_output / group;
+
+    // pack4
+    if (channels_g % 4 == 0 && num_output_g % 4 == 0)
+    {
+        // src = kw-kh-inch-outch
+        // dst = 4a-4b-kw-kh-inch/4a-outch/4b
+        Mat weight_data_pack4_groups;
+        {
+            Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);
+
+            weight_data_pack4_groups.create(16*maxk, channels_g/4, num_output_g/4 * group);
+
+            for (int g=0; g<group; g++)
+            {
+                const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);
+
+                Mat weight_data_pack4 = weight_data_pack4_groups.channel_range(num_output_g/4 * g, num_output_g/4);
+
+                for (int q=0; q+3<num_output_g; q+=4)
+                {
+                    const Mat k0 = weight_data_r2.channel(q);
+                    const Mat k1 = weight_data_r2.channel(q+1);
+                    const Mat k2 = weight_data_r2.channel(q+2);
+                    const Mat k3 = weight_data_r2.channel(q+3);
+
+                    Mat g0 = weight_data_pack4.channel(q/4);
+
+                    for (int p=0; p+3<channels_g; p+=4)
+                    {
+                        const float* k00 = k0.row(p);
+                        const float* k01 = k0.row(p+1);
+                        const float* k02 = k0.row(p+2);
+                        const float* k03 = k0.row(p+3);
+
+                        const float* k10 = k1.row(p);
+                        const float* k11 = k1.row(p+1);
+                        const float* k12 = k1.row(p+2);
+                        const float* k13 = k1.row(p+3);
+
+                        const float* k20 = k2.row(p);
+                        const float* k21 = k2.row(p+1);
+                        const float* k22 = k2.row(p+2);
+                        const float* k23 = k2.row(p+3);
+
+                        const float* k30 = k3.row(p);
+                        const float* k31 = k3.row(p+1);
+                        const float* k32 = k3.row(p+2);
+                        const float* k33 = k3.row(p+3);
+
+                        float* g00 = g0.row(p/4);
+
+                        for (int k=0; k<maxk; k++)
+                        {
+                            g00[0] = k00[k];
+                            g00[1] = k01[k];
+                            g00[2] = k02[k];
+                            g00[3] = k03[k];
+
+                            g00[4] = k10[k];
+                            g00[5] = k11[k];
+                            g00[6] = k12[k];
+                            g00[7] = k13[k];
+
+                            g00[8] = k20[k];
+                            g00[9] = k21[k];
+                            g00[10] = k22[k];
+                            g00[11] = k23[k];
+
+                            g00[12] = k30[k];
+                            g00[13] = k31[k];
+                            g00[14] = k32[k];
+                            g00[15] = k33[k];
+
+                            g00 += 16;
+                        }
+                    }
+                }
+            }
+        }
+
+        weight_data_pack4_groups = weight_data_pack4_groups.reshape(16*maxk * (channels_g/4) * (num_output_g/4) * group);
+        cmd.record_upload(weight_data_pack4_groups, weight_data_gpu_pack4);
+    }
+
+    // pack1to4
+    if (channels_g % 4 != 0 && num_output_g % 4 == 0)
+    {
+        // src = kw-kh-inch-outch
+        // dst = 4b-kw-kh-inch-outch/4b
+        Mat weight_data_pack1to4_groups;
+        {
+            Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);
+
+            weight_data_pack1to4_groups.create(4*maxk, channels_g, num_output_g/4 * group);
+
+            for (int g=0; g<group; g++)
+            {
+                const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);
+
+                Mat weight_data_pack1to4 = weight_data_pack1to4_groups.channel_range(num_output_g/4 * g, num_output_g/4);
+
+                for (int q=0; q+3<num_output_g; q+=4)
+                {
+                    const Mat k0 = weight_data_r2.channel(q);
+                    const Mat k1 = weight_data_r2.channel(q+1);
+                    const Mat k2 = weight_data_r2.channel(q+2);
+                    const Mat k3 = weight_data_r2.channel(q+3);
+
+                    Mat g0 = weight_data_pack1to4.channel(q/4);
+
+                    for (int p=0; p<channels_g; p++)
+                    {
+                        const float* k00 = k0.row(p);
+                        const float* k10 = k1.row(p);
+                        const float* k20 = k2.row(p);
+                        const float* k30 = k3.row(p);
+
+                        float* g00 = g0.row(p);
+
+                        for (int k=0; k<maxk; k++)
+                        {
+                            g00[0] = k00[k];
+                            g00[1] = k10[k];
+                            g00[2] = k20[k];
+                            g00[3] = k30[k];
+
+                            g00 += 4;
+                        }
+                    }
+                }
+            }
+        }
+
+        weight_data_pack1to4_groups = weight_data_pack1to4_groups.reshape(4*maxk * channels_g * (num_output_g/4) * group);
+        cmd.record_upload(weight_data_pack1to4_groups, weight_data_gpu_pack1to4);
+    }
+
+    // pack4to1
+    if (channels_g % 4 == 0 && num_output_g % 4 != 0)
+    {
+        // src = kw-kh-inch-outch
+        // dst = 4a-kw-kh-inch/4a-outch
+        Mat weight_data_pack4to1_groups;
+        {
+            Mat weight_data_r2_groups = weight_data_transposed.reshape(maxk, channels_g, num_output_g * group);
+
+            weight_data_pack4to1_groups.create(4*maxk, channels_g/4, num_output_g * group);
+
+            for (int g=0; g<group; g++)
+            {
+                const Mat weight_data_r2 = weight_data_r2_groups.channel_range(num_output_g * g, num_output_g);
+
+                Mat weight_data_pack4to1 = weight_data_pack4to1_groups.channel_range(num_output_g * g, num_output_g);
+
+                for (int q=0; q<num_output_g; q++)
+                {
+                    const Mat k0 = weight_data_r2.channel(q);
+                    Mat g0 = weight_data_pack4to1.channel(q);
+
+                    for (int p=0; p+3<channels_g; p+=4)
+                    {
+                        const float* k00 = k0.row(p);
+                        const float* k01 = k0.row(p+1);
+                        const float* k02 = k0.row(p+2);
+                        const float* k03 = k0.row(p+3);
+
+                        float* g00 = g0.row(p/4);
+
+                        for (int k=0; k<maxk; k++)
+                        {
+                            g00[0] = k00[k];
+                            g00[1] = k01[k];
+                            g00[2] = k02[k];
+                            g00[3] = k03[k];
+
+                            g00 += 4;
+                        }
+                    }
+                }
+            }
+        }
+
+        weight_data_pack4to1_groups = weight_data_pack4to1_groups.reshape(4*maxk * (channels_g/4) * num_output_g * group);
+        cmd.record_upload(weight_data_pack4to1_groups, weight_data_gpu_pack4to1);
+    }
+
+    if (num_output_g % 4 == 0)
+    {
+        if (bias_term)
+        {
+            Mat bias_data_pack4;
+            convert_packing(bias_data, bias_data_pack4, 4);
+            cmd.record_upload(bias_data_pack4, bias_data_gpu_pack4);
+        }
     }
 
     return 0;
@@ -386,9 +561,57 @@ int DeconvolutionDepthWise::create_pipeline()
         return 0;
     }
 
-    for (int g=0; g<group; g++)
+    const int channels_g = channels / group;
+    const int num_output_g = num_output / group;
+
+    // group deconvolution
+    pipeline_deconvolutiondepthwise_group = new Pipeline(vkdev);
+    pipeline_deconvolutiondepthwise_group->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
+
+    std::vector<vk_specialization_type> specializations(8);
+    specializations[0].i = kernel_w;
+    specializations[1].i = kernel_h;
+    specializations[2].i = dilation_w;
+    specializations[3].i = dilation_h;
+    specializations[4].i = stride_w;
+    specializations[5].i = stride_h;
+    specializations[6].i = bias_term;
+    specializations[7].i = group;
+
+    pipeline_deconvolutiondepthwise_group->create("deconvolutiondepthwise_group", specializations, 4, 10);
+
+    // pack4
+    if (channels_g % 4 == 0 && num_output_g % 4 == 0)
+    {
+        pipeline_deconvolutiondepthwise_group_pack4 = new Pipeline(vkdev);
+        pipeline_deconvolutiondepthwise_group_pack4->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
+        pipeline_deconvolutiondepthwise_group_pack4->create("deconvolutiondepthwise_group_pack4", specializations, 4, 10);
+    }
+
+    // pack1to4
+    if (channels_g % 4 != 0 && num_output_g % 4 == 0)
+    {
+        pipeline_deconvolutiondepthwise_group_pack1to4 = new Pipeline(vkdev);
+        pipeline_deconvolutiondepthwise_group_pack1to4->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
+        pipeline_deconvolutiondepthwise_group_pack1to4->create("deconvolutiondepthwise_group_pack1to4", specializations, 4, 10);
+    }
+
+    // pack4to1
+    if (channels_g % 4 == 0 && num_output_g % 4 != 0)
+    {
+        pipeline_deconvolutiondepthwise_group_pack4to1 = new Pipeline(vkdev);
+        pipeline_deconvolutiondepthwise_group_pack4to1->set_optimal_local_size_xyz(32, 32, std::max(1, num_output / 8));
+        pipeline_deconvolutiondepthwise_group_pack4to1->create("deconvolutiondepthwise_group_pack4to1", specializations, 4, 10);
+    }
+
+    if (channels % 4 == 0 && channels_g % 4 != 0)
     {
-        deconvolution_group_ops[g]->create_pipeline();
+        packing_pack1->create_pipeline();
+    }
+
+    if (num_output_g % 4 != 0 && num_output % 4 == 0)
+    {
+        packing_pack4->create_pipeline();
     }
 
     return 0;
@@ -396,10 +619,11 @@ int DeconvolutionDepthWise::create_pipeline()
 
 int DeconvolutionDepthWise::destroy_pipeline()
 {
-    for (int g=0; g<(int)deconvolution_group_ops.size(); g++)
-    {
-        deconvolution_group_ops[g]->destroy_pipeline();
-    }
+    if (packing_pack1)
+        packing_pack1->destroy_pipeline();
+
+    if (packing_pack4)
+        packing_pack4->destroy_pipeline();
 
     delete pipeline_deconvolutiondepthwise;
     pipeline_deconvolutiondepthwise = 0;
@@ -407,6 +631,18 @@ int DeconvolutionDepthWise::destroy_pipeline()
     delete pipeline_deconvolutiondepthwise_pack4;
     pipeline_deconvolutiondepthwise_pack4 = 0;
 
+    delete pipeline_deconvolutiondepthwise_group;
+    pipeline_deconvolutiondepthwise_group = 0;
+
+    delete pipeline_deconvolutiondepthwise_group_pack4;
+    pipeline_deconvolutiondepthwise_group_pack4 = 0;
+
+    delete pipeline_deconvolutiondepthwise_group_pack1to4;
+    pipeline_deconvolutiondepthwise_group_pack1to4 = 0;
+
+    delete pipeline_deconvolutiondepthwise_group_pack4to1;
+    pipeline_deconvolutiondepthwise_group_pack4to1 = 0;
+
     return 0;
 }
 
@@ -423,10 +659,10 @@ int DeconvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, V
 
     int outw = (w - 1) * stride_w + kernel_extent_w;
     int outh = (h - 1) * stride_h + kernel_extent_h;
+    int out_packing = num_output % 4 == 0 ? 4 : 1;
+    size_t out_elemsize = elemsize / packing * out_packing;
 
-    // TODO assert num_output % packing == 0
-
-    top_blob.create(outw, outh, num_output / packing, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator);
+    top_blob.create(outw, outh, num_output / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
     if (top_blob.empty())
         return -100;
 
@@ -463,25 +699,94 @@ int DeconvolutionDepthWise::forward(const VkMat& bottom_blob, VkMat& top_blob, V
         return 0;
     }
 
-    // record
-    cmd.record_prepare_compute_barrier(top_blob);
+    const int channels_g = channels * packing / group;
+    const int num_output_g = num_output / group;
 
-    const int channels_g = channels / group;
-    const int num_output_g = num_output / packing / group;
+    // unpacking
+    VkMat bottom_blob_unpacked = bottom_blob;
+    if (packing == 4 && channels_g % 4 != 0)
+    {
+        ncnn::Option opt_pack1 = opt;
+        opt_pack1.blob_vkallocator = opt.workspace_vkallocator;
+
+        packing_pack1->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1);
+    }
+
+    VkMat top_blob_unpacked = top_blob;
+    if (num_output_g % 4 != 0 && out_packing == 4)
+    {
+        top_blob_unpacked.create(outw, outh, num_output, elemsize / packing, 1, opt.workspace_vkallocator, opt.staging_vkallocator);
+        if (top_blob_unpacked.empty())
+            return -100;
+    }
 
-    for (int g=0; g<group; g++)
+    std::vector<VkMat> bindings(4);
+    bindings[0] = bottom_blob_unpacked;
+    bindings[1] = top_blob_unpacked;
+    if (channels_g % 4 != 0 && num_output_g % 4 != 0)
+    {
+        bindings[2] = weight_data_gpu;
+        bindings[3] = bias_term ? bias_data_gpu : weight_data_gpu;// TODO use dummy buffer
+    }
+    else if (channels_g % 4 == 0 && num_output_g % 4 == 0)
+    {
+        bindings[2] = weight_data_gpu_pack4;
+        bindings[3] = bias_term ? bias_data_gpu_pack4 : weight_data_gpu_pack4;// TODO use dummy buffer
+    }
+    else if (channels_g % 4 != 0 && num_output_g % 4 == 0)
     {
-        VkMat bottom_blob_bordered_g = bottom_blob.channel_range(channels_g * g, channels_g);
-        VkMat top_blob_g = top_blob.channel_range(num_output_g * g, num_output_g);
+        bindings[2] = weight_data_gpu_pack1to4;
+        bindings[3] = bias_term ? bias_data_gpu_pack4 : weight_data_gpu_pack1to4;// TODO use dummy buffer
+    }
+    else if (channels_g % 4 == 0 && num_output_g % 4 != 0)
+    {
+        bindings[2] = weight_data_gpu_pack4to1;
+        bindings[3] = bias_term ? bias_data_gpu : weight_data_gpu_pack4to1;// TODO use dummy buffer
+    }
 
-        const ncnn::Layer* op = deconvolution_group_ops[g];
+    std::vector<vk_constant_type> constants(10);
+    constants[0].i = bottom_blob_unpacked.dims;
+    constants[1].i = bottom_blob_unpacked.w;
+    constants[2].i = bottom_blob_unpacked.h;
+    constants[3].i = bottom_blob_unpacked.c;
+    constants[4].i = bottom_blob_unpacked.cstep;
+    constants[5].i = top_blob_unpacked.dims;
+    constants[6].i = top_blob_unpacked.w;
+    constants[7].i = top_blob_unpacked.h;
+    constants[8].i = top_blob_unpacked.c;
+    constants[9].i = top_blob_unpacked.cstep;
+
+    const Pipeline* pipeline = 0;
+    if (channels_g % 4 != 0 && num_output_g % 4 != 0)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group;
+    }
+    else if (channels_g % 4 == 0 && num_output_g % 4 == 0)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group_pack4;
+    }
+    else if (channels_g % 4 != 0 && num_output_g % 4 == 0)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group_pack1to4;
+    }
+    else if (channels_g % 4 == 0 && num_output_g % 4 != 0)
+    {
+        pipeline = pipeline_deconvolutiondepthwise_group_pack4to1;
+    }
 
-        ncnn::Option opt_g = opt;
-        opt_g.blob_vkallocator = top_blob.allocator;
-        opt_g.staging_vkallocator = top_blob.staging_allocator;
+    // record
+    cmd.record_prepare_compute_barrier(bottom_blob_unpacked);
+    cmd.record_prepare_compute_barrier(top_blob_unpacked);
+    cmd.record_pipeline(pipeline, bindings, constants, top_blob_unpacked);
 
-        // forward
-        op->forward(bottom_blob_bordered_g, top_blob_g, cmd, opt_g);
+    // packing
+    if (num_output_g % 4 != 0 && out_packing == 4)
+    {
+        packing_pack4->forward(top_blob_unpacked, top_blob, cmd, opt);
+    }
+    else
+    {
+        top_blob = top_blob_unpacked;
     }
 
     return 0;
diff --git a/src/layer/deconvolutiondepthwise.h b/src/layer/deconvolutiondepthwise.h
index 8cab20802..e9713ad4b 100644
--- a/src/layer/deconvolutiondepthwise.h
+++ b/src/layer/deconvolutiondepthwise.h
@@ -64,13 +64,27 @@ public:
     VkMat weight_data_gpu;
     VkMat bias_data_gpu;
 
-    std::vector<ncnn::Layer*> deconvolution_group_ops;
+    ncnn::Layer* packing_pack1;
+    ncnn::Layer* packing_pack4;
 
     Pipeline* pipeline_deconvolutiondepthwise;
 
+    // pack4
     VkMat weight_data_gpu_pack4;
     VkMat bias_data_gpu_pack4;
+
     Pipeline* pipeline_deconvolutiondepthwise_pack4;
+
+    Pipeline* pipeline_deconvolutiondepthwise_group;
+    Pipeline* pipeline_deconvolutiondepthwise_group_pack4;
+
+    // pack1to4
+    VkMat weight_data_gpu_pack1to4;
+    Pipeline* pipeline_deconvolutiondepthwise_group_pack1to4;
+
+    // pack4to1
+    VkMat weight_data_gpu_pack4to1;
+    Pipeline* pipeline_deconvolutiondepthwise_group_pack4to1;
 #endif // NCNN_VULKAN
 
 };
diff --git a/src/layer/shader/deconvolutiondepthwise_group.comp b/src/layer/shader/deconvolutiondepthwise_group.comp
new file mode 100644
index 000000000..a399010de
--- /dev/null
+++ b/src/layer/shader/deconvolutiondepthwise_group.comp
@@ -0,0 +1,117 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int kernel_w = 1;
+layout (constant_id = 1) const int kernel_h = 1;
+layout (constant_id = 2) const int dilation_w = 1;
+layout (constant_id = 3) const int dilation_h = 1;
+layout (constant_id = 4) const int stride_w = 1;
+layout (constant_id = 5) const int stride_h = 1;
+layout (constant_id = 6) const int bias_term = 0;
+layout (constant_id = 7) const int group = 1;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
+layout (binding = 2) readonly buffer weight_blob { float weight_data[]; };
+layout (binding = 3) readonly buffer bias_blob { float bias_data[]; };
+
+layout (push_constant) uniform parameter // shape info; forward() fills constants[0..9] in this order
+{
+    int dims; // bottom blob: dims
+    int w; // bottom blob: width
+    int h; // bottom blob: height
+    int c; // bottom blob: channel count (float elements, all groups together)
+    int cstep; // bottom blob: element stride between two consecutive channels
+
+    int outdims; // top blob: dims
+    int outw; // top blob: width
+    int outh; // top blob: height
+    int outc; // top blob: channel count (float elements, all groups together)
+    int outcstep; // top blob: element stride between two consecutive channels
+} p;
+
+void main() // each invocation computes the single output element (gx, gy) of output channel gz
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocations outside the output extent do nothing
+        return;
+
+    float sum;
+
+    if (bias_term == 1)
+    {
+        sum = bias_data[gz];
+    }
+    else
+    {
+        sum = 0.f;
+    }
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; // width covered by the dilated kernel
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; // height covered by the dilated kernel
+
+    // group deconvolution: each group has channels_g input and num_output_g output channels
+    const int channels_g = p.c / group;
+    const int num_output_g = p.outc / group;
+
+    // index of the group that output channel gz belongs to
+    const int gg = gz / num_output_g;
+
+    int w_offset_0 = gz * channels_g * kernel_w * kernel_h; // first weight belonging to output channel gz
+    int v_offset_0 = gg * channels_g * p.cstep; // first input channel of group gg
+
+    for (int y = 0; y < kernel_h; y++)
+    {
+        int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); // source row scaled by stride_h, negative near the top border
+        if (sys < 0 || sys % stride_h != 0) // sign is tested first: GLSL leaves % undefined for negative operands
+            continue;
+
+        int sy = sys / stride_h;
+        if (sy >= p.h) // sy >= 0 already holds because sys >= 0
+            continue;
+
+        for (int x = 0; x < kernel_w; x++)
+        {
+            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); // source column scaled by stride_w, negative near the left border
+            if (sxs < 0 || sxs % stride_w != 0) // sign is tested first: GLSL leaves % undefined for negative operands
+                continue;
+
+            int sx = sxs / stride_w;
+            if (sx >= p.w) // sx >= 0 already holds because sxs >= 0
+                continue;
+
+            int v_offset = v_offset_0 + sy * p.w + sx;
+            int w_offset = w_offset_0 + y * kernel_w + x;
+
+            for (int z = 0; z < channels_g; z++) // accumulate over the input channels of this group
+            {
+                sum += weight_data[w_offset] * bottom_blob_data[v_offset];
+
+                v_offset += p.cstep; // same pixel, next input channel
+                w_offset += kernel_w * kernel_h; // same tap, next input channel
+            }
+        }
+    }
+
+    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
+}
diff --git a/src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp b/src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp
new file mode 100644
index 000000000..8def83602
--- /dev/null
+++ b/src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp
@@ -0,0 +1,121 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int kernel_w = 1;
+layout (constant_id = 1) const int kernel_h = 1;
+layout (constant_id = 2) const int dilation_w = 1;
+layout (constant_id = 3) const int dilation_h = 1;
+layout (constant_id = 4) const int stride_w = 1;
+layout (constant_id = 5) const int stride_h = 1;
+layout (constant_id = 6) const int bias_term = 0;
+layout (constant_id = 7) const int group = 1;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
+layout (binding = 2) readonly buffer weight_blob { vec4 weight_data[]; };
+layout (binding = 3) readonly buffer bias_blob { vec4 bias_data[]; };
+
+layout (push_constant) uniform parameter // shape info; forward() fills constants[0..9] in this order
+{
+    int dims; // bottom blob: dims
+    int w; // bottom blob: width
+    int h; // bottom blob: height
+    int c; // bottom blob: channel count (bottom_blob_data holds float, all groups together)
+    int cstep; // bottom blob: element stride between two consecutive channels
+
+    int outdims; // top blob: dims
+    int outw; // top blob: width
+    int outh; // top blob: height
+    int outc; // top blob: channel count in vec4 elements (top_blob_data holds vec4, all groups together)
+    int outcstep; // top blob: vec4 element stride between two consecutive channels
+} p;
+
+void main() // each invocation computes the single vec4 output element (gx, gy) of packed output channel gz
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocations outside the output extent do nothing
+        return;
+
+    vec4 sum;
+
+    if (bias_term == 1)
+    {
+        sum = bias_data[gz];
+    }
+    else
+    {
+        sum = vec4(0.f);
+    }
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; // width covered by the dilated kernel
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; // height covered by the dilated kernel
+
+    // group deconvolution: each group has channels_g scalar input and num_output_g vec4 output channels
+    const int channels_g = p.c / group;
+    const int num_output_g = p.outc / group;
+
+    // index of the group that packed output channel gz belongs to
+    const int gg = gz / num_output_g;
+
+    int w_offset_0 = gz * channels_g * kernel_w * kernel_h; // first vec4 weight belonging to packed output channel gz
+    int v_offset_0 = gg * channels_g * p.cstep; // first input channel of group gg
+
+    for (int y = 0; y < kernel_h; y++)
+    {
+        int sys = (gy + y * dilation_h - (kernel_extent_h - 1)); // source row scaled by stride_h, negative near the top border
+        if (sys < 0 || sys % stride_h != 0) // sign is tested first: GLSL leaves % undefined for negative operands
+            continue;
+
+        int sy = sys / stride_h;
+        if (sy >= p.h) // sy >= 0 already holds because sys >= 0
+            continue;
+
+        for (int x = 0; x < kernel_w; x++)
+        {
+            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1)); // source column scaled by stride_w, negative near the left border
+            if (sxs < 0 || sxs % stride_w != 0) // sign is tested first: GLSL leaves % undefined for negative operands
+                continue;
+
+            int sx = sxs / stride_w;
+            if (sx >= p.w) // sx >= 0 already holds because sxs >= 0
+                continue;
+
+            int v_offset = v_offset_0 + sy * p.w + sx;
+            int w_offset = w_offset_0 + y * kernel_w + x;
+
+            for (int z = 0; z < channels_g; z++) // accumulate over the input channels of this group
+            {
+                float v = bottom_blob_data[v_offset]; // one scalar input value
+
+                vec4 k = weight_data[w_offset]; // its weights for the four packed output channels
+
+                sum += v * k;
+
+                v_offset += p.cstep; // same pixel, next input channel
+                w_offset += kernel_w * kernel_h; // same tap, next input channel
+            }
+        }
+    }
+
+    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
+}
diff --git a/src/layer/shader/deconvolutiondepthwise_group_pack4.comp b/src/layer/shader/deconvolutiondepthwise_group_pack4.comp
new file mode 100644
index 000000000..65670c5c5
--- /dev/null
+++ b/src/layer/shader/deconvolutiondepthwise_group_pack4.comp
@@ -0,0 +1,121 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Specialization constants (ids 0-7); the values below are only the defaults.
+layout (constant_id = 0) const int kernel_w = 1;
+layout (constant_id = 1) const int kernel_h = 1;
+layout (constant_id = 2) const int dilation_w = 1;
+layout (constant_id = 3) const int dilation_h = 1;
+layout (constant_id = 4) const int stride_w = 1;
+layout (constant_id = 5) const int stride_h = 1;
+layout (constant_id = 6) const int bias_term = 0; // main() reads bias_data only when this is 1
+layout (constant_id = 7) const int group = 1; // main() divides p.c and p.outc by this
+// Workgroup size is taken from specialization ids 233-235.
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+// Storage buffers: vec4 elements for input, output and bias, one mat4 per weight entry.
+layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
+layout (binding = 2) readonly buffer weight_blob { mat4 weight_data[]; };
+layout (binding = 3) readonly buffer bias_blob { vec4 bias_data[]; };
+// Push constants: input fields first, then their out* counterparts; main() does not read dims or outdims.
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep; // main() advances the input offset by this for each input element of the group
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep; // multiplied by gz to locate the output element in top_blob_data
+} p;
+
+// Grouped deconvolution on vec4 data: each invocation computes the single vec4 stored at
+// top_blob_data[gz * p.outcstep + gy * p.outw + gx]. Starting from the bias (if enabled), it adds
+// v * k for every kernel tap that maps onto an in-range input sample, over the channels_g inputs of its group.
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    vec4 sum;
+
+    if (bias_term == 1)
+    {
+        sum = bias_data[gz];
+    }
+    else
+    {
+        sum = vec4(0.f);
+    }
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    // group convolution: per-group element counts and the id of the group this output element belongs to
+    const int channels_g = p.c / group;
+    const int num_output_g = p.outc / group;
+    const int gg = gz / num_output_g;
+
+    int w_offset_0 = gz * channels_g * kernel_w * kernel_h;
+    int v_offset_0 = gg * channels_g * p.cstep;
+
+    for (int y = 0; y < kernel_h; y++)
+    {
+        // GLSL leaves % undefined for negative operands: reject sys < 0 before the stride test (same for sxs below)
+        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));
+        if (sys < 0 || sys % stride_h != 0)
+            continue;
+
+        int sy = sys / stride_h;
+        if (sy >= p.h)
+            continue;
+
+        for (int x = 0; x < kernel_w; x++)
+        {
+            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));
+            if (sxs < 0 || sxs % stride_w != 0)
+                continue;
+
+            int sx = sxs / stride_w;
+            if (sx >= p.w)
+                continue;
+
+            int v_offset = v_offset_0 + sy * p.w + sx;
+            int w_offset = w_offset_0 + y * kernel_w + x;
+
+            for (int z = 0; z < channels_g; z++)
+            {
+                vec4 v = bottom_blob_data[v_offset];
+                mat4 k = weight_data[w_offset];
+                sum += v * k;
+                // next input element of the group: p.cstep in the input, one kernel_w * kernel_h plane in the weights
+                v_offset += p.cstep;
+                w_offset += kernel_w * kernel_h;
+            }
+        }
+    }
+
+    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
+}
diff --git a/src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp b/src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp
new file mode 100644
index 000000000..4f5948b69
--- /dev/null
+++ b/src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp
@@ -0,0 +1,121 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Specialization constants (ids 0-7); the values below are only the defaults.
+layout (constant_id = 0) const int kernel_w = 1;
+layout (constant_id = 1) const int kernel_h = 1;
+layout (constant_id = 2) const int dilation_w = 1;
+layout (constant_id = 3) const int dilation_h = 1;
+layout (constant_id = 4) const int stride_w = 1;
+layout (constant_id = 5) const int stride_h = 1;
+layout (constant_id = 6) const int bias_term = 0; // main() reads bias_data only when this is 1
+layout (constant_id = 7) const int group = 1; // main() divides p.c and p.outc by this
+// Workgroup size is taken from specialization ids 233-235.
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+// Storage buffers: vec4 elements for input and weights, float elements for output and bias.
+layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
+layout (binding = 2) readonly buffer weight_blob { vec4 weight_data[]; };
+layout (binding = 3) readonly buffer bias_blob { float bias_data[]; };
+// Push constants: input fields first, then their out* counterparts; main() does not read dims or outdims.
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep; // main() advances the input offset by this for each input element of the group
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep; // multiplied by gz to locate the output element in top_blob_data
+} p;
+
+// Grouped deconvolution from vec4 input to float output: each invocation computes the single float stored at
+// top_blob_data[gz * p.outcstep + gy * p.outw + gx]. Starting from the bias (if enabled), it adds
+// dot(v, k) for every kernel tap that maps onto an in-range input sample, over the channels_g inputs of its group.
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    float sum;
+
+    if (bias_term == 1)
+    {
+        sum = bias_data[gz];
+    }
+    else
+    {
+        sum = 0.f;
+    }
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    // group convolution: per-group element counts and the id of the group this output element belongs to
+    const int channels_g = p.c / group;
+    const int num_output_g = p.outc / group;
+    const int gg = gz / num_output_g;
+
+    int w_offset_0 = gz * channels_g * kernel_w * kernel_h;
+    int v_offset_0 = gg * channels_g * p.cstep;
+
+    for (int y = 0; y < kernel_h; y++)
+    {
+        // GLSL leaves % undefined for negative operands: reject sys < 0 before the stride test (same for sxs below)
+        int sys = (gy + y * dilation_h - (kernel_extent_h - 1));
+        if (sys < 0 || sys % stride_h != 0)
+            continue;
+
+        int sy = sys / stride_h;
+        if (sy >= p.h)
+            continue;
+
+        for (int x = 0; x < kernel_w; x++)
+        {
+            int sxs = (gx + x * dilation_w - (kernel_extent_w - 1));
+            if (sxs < 0 || sxs % stride_w != 0)
+                continue;
+
+            int sx = sxs / stride_w;
+            if (sx >= p.w)
+                continue;
+
+            int v_offset = v_offset_0 + sy * p.w + sx;
+            int w_offset = w_offset_0 + y * kernel_w + x;
+
+            for (int z = 0; z < channels_g; z++)
+            {
+                vec4 v = bottom_blob_data[v_offset];
+                vec4 k = weight_data[w_offset];
+                sum += dot(v, k);
+                // next input element of the group: p.cstep in the input, one kernel_w * kernel_h plane in the weights
+                v_offset += p.cstep;
+                w_offset += kernel_w * kernel_h;
+            }
+        }
+    }
+
+    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
+}