From c41bcd98a31eed3c0bc3aeb34b61e285bb713cc4 Mon Sep 17 00:00:00 2001
From: nihui <shuizhuyuanluo@126.com>
Date: Sat, 9 Mar 2019 15:36:08 +0800
Subject: [PATCH] priorbox shader, fix permute order 1 on image, fix potential
 staging memory leak

---
 src/layer/priorbox.cpp                 | 188 +++++++++++++++++++++++++
 src/layer/priorbox.h                   |  17 +++
 src/layer/shader/permute_pack4to1.comp |   2 +-
 src/layer/shader/priorbox.comp         | 131 +++++++++++++++++
 src/layer/shader/priorbox_mxnet.comp   |  82 +++++++++++
 src/mat.h                              |   1 +
 6 files changed, 420 insertions(+), 1 deletion(-)
 create mode 100644 src/layer/shader/priorbox.comp
 create mode 100644 src/layer/shader/priorbox_mxnet.comp
diff --git a/src/layer/priorbox.cpp b/src/layer/priorbox.cpp
index a104186ea..ba7c8e2b7 100644
--- a/src/layer/priorbox.cpp
+++ b/src/layer/priorbox.cpp
@@ -24,6 +24,12 @@ PriorBox::PriorBox()
 {
     one_blob_only = false;
     support_inplace = false;
+    support_vulkan = true;
+
+#if NCNN_VULKAN
+    pipeline_priorbox = 0;
+    pipeline_priorbox_mxnet = 0;
+#endif // NCNN_VULKAN
 }
 
 int PriorBox::load_param(const ParamDict& pd)
@@ -250,4 +256,186 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
     return 0;
 }
 
+#if NCNN_VULKAN
+// Record uploads of the box-size tables; the optional tables are skipped when empty.
+int PriorBox::upload_model(VkTransfer& cmd)
+{
+    cmd.record_upload(min_sizes, min_sizes_gpu);
+    if (max_sizes.w > 0)
+        cmd.record_upload(max_sizes, max_sizes_gpu);
+    // aspect_ratios may hold no entry as well, so it gets the same guard as max_sizes
+    if (aspect_ratios.w > 0)
+        cmd.record_upload(aspect_ratios, aspect_ratios_gpu);
+    return 0;
+}
+
+// Create both compute pipelines, baking the layer parameters into specialization constants.
+int PriorBox::create_pipeline()
+{
+    const int min_size_count = min_sizes.w;
+    const int ratio_count = aspect_ratios.w;
+
+    // caffe style, shader "priorbox"
+    {
+        const int max_size_count = max_sizes.w;
+        // min boxes + max boxes + ratio boxes, the latter counted twice when flip is set
+        const int ratio_boxes = min_size_count * ratio_count;
+        int prior_count = ratio_boxes + min_size_count + max_size_count;
+        prior_count += flip ? ratio_boxes : 0;
+
+        std::vector<vk_specialization_type> spec(11);
+        spec[0].i = flip;
+        spec[1].i = clip;
+        spec[2].f = offset;
+        for (int k = 0; k < 4; k++)
+            spec[3 + k].f = variances[k];
+        spec[7].i = min_size_count;
+        spec[8].i = max_size_count;
+        spec[9].i = ratio_count;
+        spec[10].i = prior_count;
+
+        // 4 and 6 equal the binding and constant counts forward() passes to this pipeline
+        pipeline_priorbox = new Pipeline(vkdev);
+        pipeline_priorbox->set_optimal_local_size_xyz();
+        pipeline_priorbox->create("priorbox", spec, 4, 6);
+    }
+
+    // mxnet style, shader "priorbox_mxnet"
+    {
+        // one box per size, plus one per ratio except the first
+        const int prior_count = min_size_count - 1 + ratio_count;
+
+        std::vector<vk_specialization_type> spec(5);
+        spec[0].i = clip;
+        spec[1].f = offset;
+        spec[2].i = min_size_count;
+        spec[3].i = ratio_count;
+        spec[4].i = prior_count;
+
+        // 3 and 4 equal the binding and constant counts forward() passes to this pipeline
+        pipeline_priorbox_mxnet = new Pipeline(vkdev);
+        pipeline_priorbox_mxnet->set_optimal_local_size_xyz();
+        pipeline_priorbox_mxnet->create("priorbox_mxnet", spec, 3, 4);
+    }
+
+    return 0;
+}
+
+// Delete both compute pipelines; the pointers are reset so nothing dangles afterwards.
+int PriorBox::destroy_pipeline()
+{
+    delete pipeline_priorbox;
+    delete pipeline_priorbox_mxnet;
+
+    pipeline_priorbox = 0;
+    pipeline_priorbox_mxnet = 0;
+    return 0;
+}
+
+// Vulkan path: records the compute dispatch that fills top_blobs[0] with prior boxes.
+// Returns 0 on success and -100 when the output blob could not be created.
+int PriorBox::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
+{
+    int w = bottom_blobs[0].w;
+    int h = bottom_blobs[0].h;
+
+    if (bottom_blobs.size() == 1 && image_width == -233 && image_height == -233 && max_sizes.empty())
+    {
+        // mxnet style _contrib_MultiBoxPrior
+        float step_w = step_width;
+        float step_h = step_height;
+        if (step_w == -233) // -233 selects the default step 1 / w
+            step_w = 1.f / (float)w;
+        if (step_h == -233) // -233 selects the default step 1 / h
+            step_h = 1.f / (float)h;
+
+        int num_sizes = min_sizes.w;
+        int num_ratios = aspect_ratios.w;
+
+        int num_prior = num_sizes - 1 + num_ratios;
+
+        VkMat& top_blob = top_blobs[0];
+        top_blob.create(4 * w * h * num_prior, 4u, opt.blob_vkallocator, opt.staging_vkallocator);
+        if (top_blob.empty())
+            return -100;
+
+        std::vector<VkMat> bindings(3);
+        bindings[0] = top_blob;
+        bindings[1] = min_sizes_gpu;
+        bindings[2] = aspect_ratios_gpu;
+
+        std::vector<vk_constant_type> constants(4);
+        constants[0].i = w;
+        constants[1].i = h;
+        constants[2].f = step_w;
+        constants[3].f = step_h;
+
+        // record
+        cmd.record_prepare_compute_barrier(top_blob);
+
+        VkMat dispatcher; // only carries the dispatch extent: sizes x width x height
+        dispatcher.w = num_sizes;
+        dispatcher.h = w;
+        dispatcher.c = h;
+
+        cmd.record_pipeline(pipeline_priorbox_mxnet, bindings, constants, dispatcher);
+
+        return 0;
+    }
+    // caffe style: an image size of -233 is replaced by the size of the second bottom blob
+    int image_w = image_width;
+    int image_h = image_height;
+    if (image_w == -233)
+        image_w = bottom_blobs[1].w;
+    if (image_h == -233)
+        image_h = bottom_blobs[1].h;
+
+    float step_w = step_width;
+    float step_h = step_height;
+    if (step_w == -233) // -233 selects the default step image_w / w
+        step_w = (float)image_w / w;
+    if (step_h == -233) // -233 selects the default step image_h / h
+        step_h = (float)image_h / h;
+
+    int num_min_size = min_sizes.w;
+    int num_max_size = max_sizes.w;
+    int num_aspect_ratio = aspect_ratios.w;
+
+    int num_prior = num_min_size * num_aspect_ratio + num_min_size + num_max_size;
+    if (flip)
+        num_prior += num_min_size * num_aspect_ratio;
+
+    VkMat& top_blob = top_blobs[0];
+    top_blob.create(4 * w * h * num_prior, 2, 4u, opt.blob_vkallocator, opt.staging_vkallocator);
+    if (top_blob.empty())
+        return -100;
+    // an empty table is never bound: min_sizes_gpu stands in, the shader does not read that slot then
+    std::vector<VkMat> bindings(4);
+    bindings[0] = top_blob;
+    bindings[1] = min_sizes_gpu;
+    bindings[2] = num_max_size > 0 ? max_sizes_gpu : min_sizes_gpu;
+    bindings[3] = num_aspect_ratio > 0 ? aspect_ratios_gpu : min_sizes_gpu;
+
+    std::vector<vk_constant_type> constants(6);
+    constants[0].i = w;
+    constants[1].i = h;
+    constants[2].f = image_w;
+    constants[3].f = image_h;
+    constants[4].f = step_w;
+    constants[5].f = step_h;
+
+    // record
+    cmd.record_prepare_compute_barrier(top_blob);
+
+    VkMat dispatcher; // only carries the dispatch extent: min sizes x width x height
+    dispatcher.w = num_min_size;
+    dispatcher.h = w;
+    dispatcher.c = h;
+
+    cmd.record_pipeline(pipeline_priorbox, bindings, constants, dispatcher);
+
+    return 0;
+}
+#endif // NCNN_VULKAN
+
 } // namespace ncnn
diff --git a/src/layer/priorbox.h b/src/layer/priorbox.h
index 249a65f25..63c52c9ef 100644
--- a/src/layer/priorbox.h
+++ b/src/layer/priorbox.h
@@ -28,6 +28,15 @@ public:
 
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
+#if NCNN_VULKAN
+    virtual int upload_model(VkTransfer& cmd); // records uploads of the size and ratio tables
+
+    virtual int create_pipeline(); // creates pipeline_priorbox and pipeline_priorbox_mxnet
+    virtual int destroy_pipeline(); // deletes both pipelines and nulls the pointers
+
+    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const; // records the prior box dispatch into cmd
+#endif // NCNN_VULKAN
+
 public:
     Mat min_sizes;
     Mat max_sizes;
@@ -40,6 +49,14 @@ public:
     float step_width;
     float step_height;
     float offset;
+
+#if NCNN_VULKAN
+    VkMat min_sizes_gpu; // upload target for min_sizes
+    VkMat max_sizes_gpu; // upload target for max_sizes, left untouched when max_sizes has no entry
+    VkMat aspect_ratios_gpu; // upload target for aspect_ratios
+    Pipeline* pipeline_priorbox; // pipeline for the "priorbox" shader (caffe style)
+    Pipeline* pipeline_priorbox_mxnet; // pipeline for the "priorbox_mxnet" shader (mxnet style)
+#endif // NCNN_VULKAN
 };
 
 } // namespace ncnn
diff --git a/src/layer/shader/permute_pack4to1.comp b/src/layer/shader/permute_pack4to1.comp
index d04a74c99..fef9eb618 100644
--- a/src/layer/shader/permute_pack4to1.comp
+++ b/src/layer/shader/permute_pack4to1.comp
@@ -61,7 +61,7 @@ void main()
         }
         if (order_type == 1)
         {
-            v_offset = ivec4(gx * p.outw + gy) + ivec4(0, 1, 2, 3);
+            v_offset = ivec4(gx * p.outw + gy * 4) + ivec4(0, 1, 2, 3);
         }
     }
     else if (p.dims == 3)
diff --git a/src/layer/shader/priorbox.comp b/src/layer/shader/priorbox.comp
new file mode 100644
index 000000000..5814a9c63
--- /dev/null
+++ b/src/layer/shader/priorbox.comp
@@ -0,0 +1,131 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int flip = 0; // 1: each ratio box is also written with width and height swapped
+layout (constant_id = 1) const int clip = 0; // 1: boxes are clamped to [0, 1] before being stored
+layout (constant_id = 2) const float offset = 0; // added to the cell index when computing the box center
+layout (constant_id = 3) const float variances_0 = 0; // variances_0..3 form the vec4 stored alongside every box
+layout (constant_id = 4) const float variances_1 = 0;
+layout (constant_id = 5) const float variances_2 = 0;
+layout (constant_id = 6) const float variances_3 = 0;
+layout (constant_id = 7) const int num_min_size = 0; // bound for the x invocation index into min_sizes_data
+layout (constant_id = 8) const int num_max_size = 0; // > 0 enables the max size box
+layout (constant_id = 9) const int num_aspect_ratio = 0; // number of aspect_ratios_data entries visited
+layout (constant_id = 10) const int num_prior = 0; // output slots reserved per (x, y) cell
+
+layout (local_size_x_id = 233) in; // workgroup size is supplied through specialization ids 233..235
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) writeonly buffer top_blob { vec4 top_blob_data[]; }; // boxes first, variances after them
+layout (binding = 1) readonly buffer min_sizes { float min_sizes_data[]; };
+layout (binding = 2) readonly buffer max_sizes { float max_sizes_data[]; }; // only read when num_max_size > 0
+layout (binding = 3) readonly buffer aspect_ratios { float aspect_ratios_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int w; // grid width, bound for the y invocation index
+    int h; // grid height, bound for the z invocation index
+
+    float image_w; // box coordinates are divided by the image size
+    float image_h;
+    float step_w; // spacing of box centers along x
+    float step_h; // spacing of box centers along y
+} p;
+
+// One invocation per (min_size index gx, column gy, row gz): writes that min_size's boxes and variances.
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= num_min_size || gy >= p.w || gz >= p.h)
+        return;
+
+    // each gx writes a whole group of boxes, so neighbouring gx must start one group apart
+    int group_size = 1 + (num_max_size > 0 ? 1 : 0) + num_aspect_ratio * (flip == 1 ? 2 : 1);
+    int v_offset = (gz * p.w + gy) * num_prior + gx * group_size;
+    int var_offset = p.w * p.h * num_prior + v_offset; // variances start after all w * h * num_prior boxes
+
+    float center_x = (gy + offset) * p.step_w;
+    float center_y = (gz + offset) * p.step_h;
+    vec4 center = vec4(center_x, center_y, center_x, center_y);
+
+    vec4 image_norm = 1.f / vec4(p.image_w, p.image_h, p.image_w, p.image_h); // scales to image-relative coordinates
+    vec4 variance = vec4(variances_0, variances_1, variances_2, variances_3);
+
+    vec4 box;
+    float box_w;
+    float box_h;
+
+    float min_size = min_sizes_data[gx];
+
+    // min size box
+    box_w = box_h = min_size;
+
+    box = (center + vec4(-box_w, -box_h, box_w, box_h) * 0.5f) * image_norm;
+
+    top_blob_data[v_offset] = clip == 1 ? clamp(box, 0.f, 1.f) : box;
+    top_blob_data[var_offset] = variance;
+
+    v_offset += 1;
+    var_offset += 1;
+
+    if (num_max_size > 0)
+    {
+        float max_size = max_sizes_data[gx];
+
+        // max size box
+        box_w = box_h = sqrt(min_size * max_size);
+
+        box = (center + vec4(-box_w, -box_h, box_w, box_h) * 0.5f) * image_norm;
+
+        top_blob_data[v_offset] = clip == 1 ? clamp(box, 0.f, 1.f) : box;
+        top_blob_data[var_offset] = variance;
+
+        v_offset += 1;
+        var_offset += 1;
+    }
+
+    // all aspect_ratios
+    for (int pi = 0; pi < num_aspect_ratio; pi++)
+    {
+        float ar = aspect_ratios_data[pi];
+
+        box_w = min_size * sqrt(ar);
+        box_h = min_size / sqrt(ar);
+
+        box = (center + vec4(-box_w, -box_h, box_w, box_h) * 0.5f) * image_norm;
+
+        top_blob_data[v_offset] = clip == 1 ? clamp(box, 0.f, 1.f) : box;
+        top_blob_data[var_offset] = variance;
+
+        v_offset += 1;
+        var_offset += 1;
+
+        if (flip == 1)
+        {
+            box = (center + vec4(-box_h, -box_w, box_h, box_w) * 0.5f) * image_norm; // width and height swapped
+
+            top_blob_data[v_offset] = clip == 1 ? clamp(box, 0.f, 1.f) : box;
+            top_blob_data[var_offset] = variance;
+
+            v_offset += 1;
+            var_offset += 1;
+        }
+    }
+}
diff --git a/src/layer/shader/priorbox_mxnet.comp b/src/layer/shader/priorbox_mxnet.comp
new file mode 100644
index 000000000..430ce7962
--- /dev/null
+++ b/src/layer/shader/priorbox_mxnet.comp
@@ -0,0 +1,82 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int clip = 0; // 1: boxes are clamped to [0, 1] before being stored
+layout (constant_id = 1) const float offset = 0; // added to the cell index when computing the box center
+layout (constant_id = 2) const int num_sizes = 0; // bound for the x invocation index into min_sizes_data
+layout (constant_id = 3) const int num_ratios = 0; // ratio entries; index 0 is never read
+layout (constant_id = 4) const int num_prior = 0; // output slots reserved per (x, y) cell
+
+layout (local_size_x_id = 233) in; // workgroup size is supplied through specialization ids 233..235
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) writeonly buffer top_blob { vec4 top_blob_data[]; }; // one vec4 per box
+layout (binding = 1) readonly buffer min_sizes { float min_sizes_data[]; };
+layout (binding = 2) readonly buffer aspect_ratios { float aspect_ratios_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int w; // grid width, bound for the y invocation index
+    int h; // grid height, bound for the z invocation index
+
+    float step_w; // spacing of box centers along x
+    float step_h; // spacing of box centers along y
+} p;
+
+// One invocation per (size index, column, row); the last size index also writes the ratio boxes.
+void main()
+{
+    int size_index = int(gl_GlobalInvocationID.x);
+    int col = int(gl_GlobalInvocationID.y);
+    int row = int(gl_GlobalInvocationID.z);
+
+    if (size_index >= num_sizes || col >= p.w || row >= p.h)
+        return;
+
+    // mxnet style _contrib_MultiBoxPrior
+    int slot = (row * p.w + col) * num_prior + size_index;
+
+    float cx = (col + offset) * p.step_w;
+    float cy = (row + offset) * p.step_h;
+    vec4 anchor = vec4(cx, cy, cx, cy);
+
+    // ratio = 1, various sizes
+    float side = min_sizes_data[size_index];
+    float extent_x = side * p.h / p.w / 2;
+    float extent_y = side / 2;
+
+    vec4 sized_box = anchor + vec4(-extent_x, -extent_y, extent_x, extent_y);
+
+    top_blob_data[slot] = clip == 1 ? clamp(sized_box, 0.f, 1.f) : sized_box;
+
+    // only the invocation of the last size goes on to the ratio boxes
+    if (size_index != num_sizes - 1)
+        return;
+
+    // various ratios, size = min_size = size[0]; ratio 0 is skipped
+    float first_size = min_sizes_data[0];
+    for (int ri = 1; ri < num_ratios; ri++)
+    {
+        float root = sqrt(aspect_ratios_data[ri]);
+        float ratio_x = first_size * p.h / p.w * root / 2;
+        float ratio_y = first_size / root / 2;
+
+        vec4 ratio_box = anchor + vec4(-ratio_x, -ratio_y, ratio_x, ratio_y);
+
+        top_blob_data[slot + ri] = clip == 1 ? clamp(ratio_box, 0.f, 1.f) : ratio_box;
+    }
+}
diff --git a/src/mat.h b/src/mat.h
index 1b77635b2..dd80f74a8 100644
--- a/src/mat.h
+++ b/src/mat.h
@@ -1421,6 +1421,7 @@ inline void VkMat::discard_staging_buffer()
     }
 
     staging_data = 0;
+    staging_refcount = 0;
 }
 
 inline void VkMat::upload(const Mat& m)