From c41bcd98a31eed3c0bc3aeb34b61e285bb713cc4 Mon Sep 17 00:00:00 2001
From: nihui
Date: Sat, 9 Mar 2019 15:36:08 +0800
Subject: [PATCH] priorbox shader, fix permute order 1 on image, fix potential staging memory leak

---
 src/layer/priorbox.cpp                 | 203 +++++++++++++++++++++++++
 src/layer/priorbox.h                   |  17 +++
 src/layer/shader/permute_pack4to1.comp |   2 +-
 src/layer/shader/priorbox.comp         | 131 +++++++++++++++++
 src/layer/shader/priorbox_mxnet.comp   |  82 +++++++++++
 src/mat.h                              |   1 +
 6 files changed, 435 insertions(+), 1 deletion(-)
 create mode 100644 src/layer/shader/priorbox.comp
 create mode 100644 src/layer/shader/priorbox_mxnet.comp

diff --git a/src/layer/priorbox.cpp b/src/layer/priorbox.cpp
index a104186ea..ba7c8e2b7 100644
--- a/src/layer/priorbox.cpp
+++ b/src/layer/priorbox.cpp
@@ -24,6 +24,12 @@ PriorBox::PriorBox()
 {
     one_blob_only = false;
     support_inplace = false;
+    support_vulkan = true;
+
+#if NCNN_VULKAN
+    pipeline_priorbox = 0;
+    pipeline_priorbox_mxnet = 0;
+#endif // NCNN_VULKAN
 }
 
 int PriorBox::load_param(const ParamDict& pd)
@@ -250,4 +256,201 @@ int PriorBox::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
     return 0;
 }
 
+#if NCNN_VULKAN
+// Record uploads of the prior-box tables (min sizes, optional max sizes,
+// aspect ratios) into their GPU-side VkMat copies. Empty tables are skipped;
+// forward() binds min_sizes_gpu in their place.
+int PriorBox::upload_model(VkTransfer& cmd)
+{
+    cmd.record_upload(min_sizes, min_sizes_gpu);
+
+    if (max_sizes.w > 0)
+        cmd.record_upload(max_sizes, max_sizes_gpu);
+
+    // guarded like max_sizes: the aspect ratio table may be empty
+    if (aspect_ratios.w > 0)
+        cmd.record_upload(aspect_ratios, aspect_ratios_gpu);
+
+    return 0;
+}
+
+// Create both compute pipelines (caffe style and mxnet style); forward()
+// picks one per call. Layer parameters are passed as specialization constants
+// in the same order as the constant_id declarations of the matching shader.
+int PriorBox::create_pipeline()
+{
+    // caffe style
+    {
+        int num_min_size = min_sizes.w;
+        int num_max_size = max_sizes.w;
+        int num_aspect_ratio = aspect_ratios.w;
+
+        int num_prior = num_min_size * num_aspect_ratio + num_min_size + num_max_size;
+        if (flip)
+            num_prior += num_min_size * num_aspect_ratio;
+
+        std::vector<vk_specialization_type> specializations(11);
+        specializations[0].i = flip;
+        specializations[1].i = clip;
+        specializations[2].f = offset;
+        specializations[3].f = variances[0];
+        specializations[4].f = variances[1];
+        specializations[5].f = variances[2];
+        specializations[6].f = variances[3];
+        specializations[7].i = num_min_size;
+        specializations[8].i = num_max_size;
+        specializations[9].i = num_aspect_ratio;
+        specializations[10].i = num_prior;
+
+        pipeline_priorbox = new Pipeline(vkdev);
+        pipeline_priorbox->set_optimal_local_size_xyz();
+        pipeline_priorbox->create("priorbox", specializations, 4, 6); // matches bindings(4) / constants(6) in forward()
+    }
+
+    // mxnet style
+    {
+        int num_sizes = min_sizes.w;
+        int num_ratios = aspect_ratios.w;
+
+        int num_prior = num_sizes - 1 + num_ratios;
+
+        std::vector<vk_specialization_type> specializations(5);
+        specializations[0].i = clip;
+        specializations[1].f = offset;
+        specializations[2].i = num_sizes;
+        specializations[3].i = num_ratios;
+        specializations[4].i = num_prior;
+
+        pipeline_priorbox_mxnet = new Pipeline(vkdev);
+        pipeline_priorbox_mxnet->set_optimal_local_size_xyz();
+        pipeline_priorbox_mxnet->create("priorbox_mxnet", specializations, 3, 4); // matches bindings(3) / constants(4) in forward()
+    }
+
+    return 0;
+}
+
+// Delete both pipelines and null the pointers so a repeated call is harmless.
+int PriorBox::destroy_pipeline()
+{
+    delete pipeline_priorbox;
+    pipeline_priorbox = 0;
+
+    delete pipeline_priorbox_mxnet;
+    pipeline_priorbox_mxnet = 0;
+
+    return 0;
+}
+
+// Record the GPU prior-box generation for top_blobs[0] into cmd.
+// With a single bottom blob, no explicit image size and no max sizes the
+// mxnet-style pipeline is dispatched; otherwise the caffe-style one.
+// Returns 0 on success, -100 if the output blob cannot be allocated.
+int PriorBox::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
+{
+    int w = bottom_blobs[0].w;
+    int h = bottom_blobs[0].h;
+
+    if (bottom_blobs.size() == 1 && image_width == -233 && image_height == -233 && max_sizes.empty())
+    {
+        // mxnet style _contrib_MultiBoxPrior
+        float step_w = step_width;
+        float step_h = step_height;
+        if (step_w == -233)
+            step_w = 1.f / (float)w;
+        if (step_h == -233)
+            step_h = 1.f / (float)h;
+
+        int num_sizes = min_sizes.w;
+        int num_ratios = aspect_ratios.w;
+
+        int num_prior = num_sizes - 1 + num_ratios;
+
+        VkMat& top_blob = top_blobs[0];
+        top_blob.create(4 * w * h * num_prior, 4u, opt.blob_vkallocator, opt.staging_vkallocator);
+        if (top_blob.empty())
+            return -100;
+
+        std::vector<VkMat> bindings(3);
+        bindings[0] = top_blob;
+        bindings[1] = min_sizes_gpu;
+        // an empty ratio table is never uploaded; bind a valid placeholder buffer instead
+        bindings[2] = num_ratios > 0 ? aspect_ratios_gpu : min_sizes_gpu;
+
+        std::vector<vk_constant_type> constants(4);
+        constants[0].i = w;
+        constants[1].i = h;
+        constants[2].f = step_w;
+        constants[3].f = step_h;
+
+        // record
+        cmd.record_prepare_compute_barrier(top_blob);
+
+        VkMat dispatcher; // one invocation per (size, column, row)
+        dispatcher.w = num_sizes;
+        dispatcher.h = w;
+        dispatcher.c = h;
+
+        cmd.record_pipeline(pipeline_priorbox_mxnet, bindings, constants, dispatcher);
+
+        return 0;
+    }
+
+    int image_w = image_width;
+    int image_h = image_height;
+    if (image_w == -233)
+        image_w = bottom_blobs[1].w;
+    if (image_h == -233)
+        image_h = bottom_blobs[1].h;
+
+    float step_w = step_width;
+    float step_h = step_height;
+    if (step_w == -233)
+        step_w = (float)image_w / w;
+    if (step_h == -233)
+        step_h = (float)image_h / h;
+
+    int num_min_size = min_sizes.w;
+    int num_max_size = max_sizes.w;
+    int num_aspect_ratio = aspect_ratios.w;
+
+    int num_prior = num_min_size * num_aspect_ratio + num_min_size + num_max_size;
+    if (flip)
+        num_prior += num_min_size * num_aspect_ratio;
+
+    VkMat& top_blob = top_blobs[0];
+    top_blob.create(4 * w * h * num_prior, 2, 4u, opt.blob_vkallocator, opt.staging_vkallocator);
+    if (top_blob.empty())
+        return -100;
+
+//     fprintf(stderr, "PriorBox::forward %p\n", top_blob.buffer());
+
+    std::vector<VkMat> bindings(4);
+    bindings[0] = top_blob;
+    bindings[1] = min_sizes_gpu;
+    bindings[2] = num_max_size > 0 ? max_sizes_gpu : min_sizes_gpu;
+    // same placeholder rule as max_sizes: never bind a table that was not uploaded
+    bindings[3] = num_aspect_ratio > 0 ? aspect_ratios_gpu : min_sizes_gpu;
+
+    std::vector<vk_constant_type> constants(6);
+    constants[0].i = w;
+    constants[1].i = h;
+    constants[2].f = image_w;
+    constants[3].f = image_h;
+    constants[4].f = step_w;
+    constants[5].f = step_h;
+
+    // record
+    cmd.record_prepare_compute_barrier(top_blob);
+
+    VkMat dispatcher; // one invocation per (min size, column, row)
+    dispatcher.w = num_min_size;
+    dispatcher.h = w;
+    dispatcher.c = h;
+
+    cmd.record_pipeline(pipeline_priorbox, bindings, constants, dispatcher);
+
+    return 0;
+}
+#endif // NCNN_VULKAN
+
 } // namespace ncnn
diff --git a/src/layer/priorbox.h b/src/layer/priorbox.h
index 249a65f25..63c52c9ef 100644
--- a/src/layer/priorbox.h
+++ b/src/layer/priorbox.h
@@ -28,6 +28,15 @@ public:
 
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
+#if NCNN_VULKAN
+    virtual int upload_model(VkTransfer& cmd);
+
+    virtual int create_pipeline();
+    virtual int destroy_pipeline();
+
+    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+#endif // NCNN_VULKAN
+
 public:
     Mat min_sizes;
     Mat max_sizes;
@@ -40,6 +49,14 @@ public:
     float step_width;
     float step_height;
     float offset;
+
+#if NCNN_VULKAN
+    VkMat min_sizes_gpu;
+    VkMat max_sizes_gpu;
+    VkMat aspect_ratios_gpu;
+    Pipeline* pipeline_priorbox;
+    Pipeline* pipeline_priorbox_mxnet;
+#endif // NCNN_VULKAN
 };
 
 } // namespace ncnn
diff --git a/src/layer/shader/permute_pack4to1.comp b/src/layer/shader/permute_pack4to1.comp
index d04a74c99..fef9eb618 100644
--- a/src/layer/shader/permute_pack4to1.comp
+++ b/src/layer/shader/permute_pack4to1.comp
@@ -61,7 +61,7 @@ void main()
         }
         if (order_type == 1)
         {
-            v_offset = ivec4(gx * p.outw + gy) + ivec4(0, 1, 2, 3);
+            v_offset = ivec4(gx * p.outw + gy * 4) + ivec4(0, 1, 2, 3);
         }
     }
     else if (p.dims == 3)
diff --git a/src/layer/shader/priorbox.comp b/src/layer/shader/priorbox.comp
new file mode 100644
index 000000000..5814a9c63
--- /dev/null
+++
b/src/layer/shader/priorbox.comp
@@ -0,0 +1,131 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int flip = 0;
+layout (constant_id = 1) const int clip = 0;
+layout (constant_id = 2) const float offset = 0;
+layout (constant_id = 3) const float variances_0 = 0;
+layout (constant_id = 4) const float variances_1 = 0;
+layout (constant_id = 5) const float variances_2 = 0;
+layout (constant_id = 6) const float variances_3 = 0;
+layout (constant_id = 7) const int num_min_size = 0;
+layout (constant_id = 8) const int num_max_size = 0;
+layout (constant_id = 9) const int num_aspect_ratio = 0;
+layout (constant_id = 10) const int num_prior = 0; // priors per feature-map cell, computed on the host
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) writeonly buffer top_blob { vec4 top_blob_data[]; }; // boxes first, then one variance vec4 per box
+layout (binding = 1) readonly buffer min_sizes { float min_sizes_data[]; };
+layout (binding = 2) readonly buffer max_sizes { float max_sizes_data[]; }; // only read when num_max_size > 0
+layout (binding = 3) readonly buffer aspect_ratios { float aspect_ratios_data[]; }; // only read when num_aspect_ratio > 0
+
+layout (push_constant) uniform parameter
+{
+    int w;
+    int h;
+
+    float image_w;
+    float image_h;
+    float step_w;
+    float step_h;
+} p;
+// One invocation per (min size gx, column gy, row gz): writes every prior box of that min size and its variances.
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= num_min_size || gy >= p.w || gz >= p.h)
+        return;
+
+    // anchor and variance; each min size owns a run of 1 + (max size box) + aspect ratio boxes (doubled by flip) in its cell
+    int v_offset = (gz * p.w + gy) * num_prior + gx * (1 + (num_max_size > 0 ? 1 : 0) + num_aspect_ratio * (flip == 1 ? 2 : 1));
+    int var_offset = p.w * p.h * num_prior + v_offset;
+
+    float center_x = (gy + offset) * p.step_w;
+    float center_y = (gz + offset) * p.step_h;
+    vec4 center = vec4(center_x, center_y, center_x, center_y);
+
+    vec4 image_norm = 1.f / vec4(p.image_w, p.image_h, p.image_w, p.image_h); // scales pixel coordinates by 1 / image size
+
+    vec4 variance = vec4(variances_0, variances_1, variances_2, variances_3);
+
+    vec4 box;
+
+    float box_w;
+    float box_h;
+
+    float min_size = min_sizes_data[gx];
+
+    // min size box
+    box_w = box_h = min_size;
+
+    box = (center + vec4(-box_w, -box_h, box_w, box_h) * 0.5f) * image_norm;
+
+    top_blob_data[v_offset] = clip == 1 ? clamp(box, 0.f, 1.f) : box;
+    top_blob_data[var_offset] = variance;
+
+    v_offset += 1;
+    var_offset += 1;
+
+    if (num_max_size > 0)
+    {
+        float max_size = max_sizes_data[gx]; // indexed by gx, so one max size per min size is expected
+
+        // max size box
+        box_w = box_h = sqrt(min_size * max_size);
+
+        box = (center + vec4(-box_w, -box_h, box_w, box_h) * 0.5f) * image_norm;
+
+        top_blob_data[v_offset] = clip == 1 ? clamp(box, 0.f, 1.f) : box;
+        top_blob_data[var_offset] = variance;
+
+        v_offset += 1;
+        var_offset += 1;
+    }
+
+    // all aspect_ratios
+    for (int pi = 0; pi < num_aspect_ratio; pi++)
+    {
+        float ar = aspect_ratios_data[pi];
+
+        box_w = min_size * sqrt(ar);
+        box_h = min_size / sqrt(ar);
+
+        box = (center + vec4(-box_w, -box_h, box_w, box_h) * 0.5f) * image_norm;
+
+        top_blob_data[v_offset] = clip == 1 ? clamp(box, 0.f, 1.f) : box;
+        top_blob_data[var_offset] = variance;
+
+        v_offset += 1;
+        var_offset += 1;
+
+        if (flip == 1)
+        {
+            box = (center + vec4(-box_h, -box_w, box_h, box_w) * 0.5f) * image_norm; // same box with width and height swapped
+
+            top_blob_data[v_offset] = clip == 1 ? clamp(box, 0.f, 1.f) : box;
+            top_blob_data[var_offset] = variance;
+
+            v_offset += 1;
+            var_offset += 1;
+        }
+    }
+}
diff --git a/src/layer/shader/priorbox_mxnet.comp b/src/layer/shader/priorbox_mxnet.comp
new file mode 100644
index 000000000..430ce7962
--- /dev/null
+++ b/src/layer/shader/priorbox_mxnet.comp
@@ -0,0 +1,82 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+layout (constant_id = 0) const int clip = 0;
+layout (constant_id = 1) const float offset = 0;
+layout (constant_id = 2) const int num_sizes = 0;
+layout (constant_id = 3) const int num_ratios = 0;
+layout (constant_id = 4) const int num_prior = 0; // priors per cell; the host passes num_sizes - 1 + num_ratios
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) writeonly buffer top_blob { vec4 top_blob_data[]; }; // one vec4 per prior box
+layout (binding = 1) readonly buffer min_sizes { float min_sizes_data[]; };
+layout (binding = 2) readonly buffer aspect_ratios { float aspect_ratios_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int w;
+    int h;
+
+    float step_w;
+    float step_h;
+} p;
+// One invocation per (size gx, column gy, row gz); the last size invocation also writes the ratio boxes of its cell.
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= num_sizes || gy >= p.w || gz >= p.h)
+        return;
+
+    // mxnet style _contrib_MultiBoxPrior
+    int v_offset = (gz * p.w + gy) * num_prior + gx; // size boxes fill the first num_sizes slots of the cell
+
+    float center_x = (gy + offset) * p.step_w;
+    float center_y = (gz + offset) * p.step_h;
+    vec4 center = vec4(center_x, center_y, center_x, center_y);
+
+    // ratio = 1, various sizes
+    float size = min_sizes_data[gx];
+    float cw = size * p.h / p.w / 2; // half width, scaled by h / w
+    float ch = size / 2; // half height
+
+    vec4 box = center + vec4(-cw, -ch, cw, ch);
+
+    top_blob_data[v_offset] = clip == 1 ? clamp(box, 0.f, 1.f) : box;
+
+    if (gx == num_sizes - 1) // only the last size invocation emits the ratio boxes, so each is written once per cell
+    {
+        // various ratios, size = min_size = size[0]
+        float size = min_sizes_data[0]; // shadows the outer size on purpose: ratio boxes always use the first size
+        for (int pi = 1; pi < num_ratios; pi++) // starts at 1; presumably ratio[0] is the ratio = 1 case handled above - TODO confirm
+        {
+            float ratio = sqrt(aspect_ratios_data[pi]);
+            float cwr = size * p.h / p.w * ratio / 2;
+            float chr = size / ratio / 2;
+//             float cwr = cw * ratio;
+//             float chr = ch / ratio;
+
+            vec4 box = center + vec4(-cwr, -chr, cwr, chr);
+
+            top_blob_data[v_offset + pi] = clip == 1 ? clamp(box, 0.f, 1.f) : box; // v_offset is the last size slot here, so + pi lands after it
+        }
+    }
+}
diff --git a/src/mat.h b/src/mat.h
index 1b77635b2..dd80f74a8 100644
--- a/src/mat.h
+++ b/src/mat.h
@@ -1421,6 +1421,7 @@ inline void VkMat::discard_staging_buffer()
     }
 
     staging_data = 0;
+    staging_refcount = 0; // reset together with staging_data (the "potential staging memory leak" fix named in the subject)
 }
 
 inline void VkMat::upload(const Mat& m)