From cd7559c6397942cf7f23faeb47c19276d0a250b2 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 29 May 2019 11:13:16 +0800 Subject: [PATCH] more fix for fp16p, still disabled by default --- src/command.cpp | 31 ++++---- src/gpu.cpp | 25 ++++-- src/layer/vulkan/convolution_vulkan.cpp | 18 +++-- .../vulkan/convolutiondepthwise_vulkan.cpp | 16 ++-- src/layer/vulkan/crop_vulkan.cpp | 12 +++ src/layer/vulkan/deconvolution_vulkan.cpp | 9 +-- .../vulkan/deconvolutiondepthwise_vulkan.cpp | 9 +-- src/layer/vulkan/flatten_vulkan.cpp | 40 +++++++--- src/layer/vulkan/flatten_vulkan.h | 1 + src/layer/vulkan/innerproduct_vulkan.cpp | 21 ++--- src/layer/vulkan/packing_vulkan.cpp | 17 ++++- src/layer/vulkan/permute_vulkan.cpp | 6 ++ src/layer/vulkan/priorbox_vulkan.cpp | 10 ++- src/layer/vulkan/reorg_vulkan.cpp | 6 ++ src/layer/vulkan/reshape_vulkan.cpp | 18 +++++ src/layer/vulkan/shader/flatten_pack1to4.comp | 76 +++++++++++++++++++ src/layer/vulkan/shader/padding_pack4.comp | 5 ++ src/layer/vulkan/shader/priorbox.comp | 24 +++++- src/net.cpp | 8 +- 19 files changed, 274 insertions(+), 78 deletions(-) create mode 100644 src/layer/vulkan/shader/flatten_pack1to4.comp diff --git a/src/command.cpp b/src/command.cpp index 22a54d7f8..62162bb2c 100644 --- a/src/command.cpp +++ b/src/command.cpp @@ -731,7 +731,7 @@ void VkCompute::dispatch(const uint32_t* group_count_xyz) void VkCompute::transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t size) { -// fprintf(stderr, "cmd transfer_compute_barrier %p[+%lu]\n", buffer, offset); +// fprintf(stderr, "cmd transfer_compute_barrier %p[+%lu] %lu\n", buffer, offset, size); VkBufferMemoryBarrier bufferBarrier; bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; @@ -752,7 +752,7 @@ void VkCompute::transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t void VkCompute::compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t size) { -// fprintf(stderr, "cmd compute_transfer_barrier %p[+%lu]\n", buffer, offset); +// fprintf(stderr, "cmd compute_transfer_barrier %p[+%lu] %lu\n", buffer, offset, size); VkBufferMemoryBarrier bufferBarrier; bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; @@ -773,7 +773,7 @@ void VkCompute::compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t void VkCompute::compute_compute_barrier(VkBuffer buffer, size_t offset, size_t size) { -// fprintf(stderr, "cmd compute_compute_barrier %p[+%lu]\n", buffer, offset); +// fprintf(stderr, "cmd compute_compute_barrier %p[+%lu] %lu\n", buffer, offset, size); VkBufferMemoryBarrier bufferBarrier; bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; @@ -794,7 +794,7 @@ void VkCompute::compute_compute_barrier(VkBuffer buffer, size_t offset, size_t s void VkCompute::transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size) { -// fprintf(stderr, "cmd transfer_transfer_barrier %p[+%lu]\n", buffer, offset); +// fprintf(stderr, "cmd transfer_transfer_barrier %p[+%lu] %lu\n", buffer, offset, size); VkBufferMemoryBarrier bufferBarrier; bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; @@ -843,30 +843,35 @@ VkTransfer::~VkTransfer() void VkTransfer::record_upload(const Mat& src, VkMat& dst) { - if ((vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage) && src.elemsize / src.packing == 4) + if (src.elemsize / src.packing == 4) { - Mat src_fp16; - cast_float32_to_float16(src, src_fp16); + if (vkdev->info.support_fp16_storage || (vkdev->info.support_fp16_packed && src.packing % 4 == 0)) + { + Mat src_fp16; + cast_float32_to_float16(src, src_fp16); - record_upload(src_fp16, dst); + record_upload(src_fp16, dst); - return; + return; + } } - dst.create_like(src, weight_vkallocator, staging_vkallocator); + Mat src_flattened = src.reshape(src.w * src.h * src.c); + + dst.create_like(src_flattened, weight_vkallocator, staging_vkallocator); // set weight blob as readonly dst.data->state = 4; if (dst.allocator->mappable) { - dst.upload(src); + dst.upload(src_flattened); return; } record_type r; - r.size = src.total() * src.elemsize; - r.mat = src; + r.size = src_flattened.total() * src_flattened.elemsize; + r.mat = src_flattened; r.vkmat = dst; delayed_records.push_back(r); } diff --git a/src/gpu.cpp b/src/gpu.cpp index 3e8b1818a..dd069b986 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -712,8 +712,8 @@ int create_gpu_instance() gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count, gpu_info.unified_memory_index, gpu_info.device_local_memory_index, gpu_info.host_visible_memory_index); - fprintf(stderr, "[%u %s] fp16s=%d fp16a=%d int8s=%d int8a=%d\n", i, physicalDeviceProperties.deviceName, - gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic, + fprintf(stderr, "[%u %s] fp16p=%d fp16s=%d fp16a=%d int8s=%d int8a=%d\n", i, physicalDeviceProperties.deviceName, + gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic, gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic); gpu_info_index++; @@ -999,6 +999,15 @@ VkAllocator* VulkanDevice::staging_allocator() const return staging_buffer_allocator; } +static inline bool string_ends_with_fp16p(const char* name) +{ + int len = strlen(name); + if (len < 6) + return false; + + return memcmp(name + len - 6, "_fp16p", 6) == 0; +} + static inline bool string_ends_with_fp16s(const char* name) { int len = strlen(name); @@ -1025,15 +1034,15 @@ int VulkanDevice::create_shader_module() { const char* shader_name = layer_shader_registry[i].name; - if (!info.support_fp16_storage) + if (!info.support_fp16_packed) { - if (string_ends_with_fp16s(shader_name)) - continue; - - if (strcmp(shader_name, "cast_fp16_to_fp32") == 0 || strcmp(shader_name, "cast_fp16_to_fp32_pack4") == 0) + if (string_ends_with_fp16p(shader_name)) continue; + } - if (strcmp(shader_name, "cast_fp32_to_fp16") == 0 || strcmp(shader_name, "cast_fp32_to_fp16_pack4") == 0) + if (!info.support_fp16_storage) + { + if (string_ends_with_fp16s(shader_name)) continue; } diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index 091ff1999..d54f3cef8 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -315,7 +315,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd) { Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); - weight_data_pack4.create(16*maxk, num_input/4, num_output/4); + weight_data_pack4.create(maxk, num_input/4, num_output/4, (size_t)4*16, 16); for (int q=0; q+3= 16 && num_output >= 16) @@ -434,7 +433,7 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd) // dst = 4a-4b-16-inch/4a-outch/4b Mat weight_data_pack4_tm; { - weight_data_pack4_tm.create(16*16, num_input/4, num_output/4); + weight_data_pack4_tm.create(16, num_input/4, num_output/4, (size_t)4*16, 16); for (int q=0; q+3info.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } + bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 16 && num_output >= 16) { diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp index 4a56e2691..c23e64abf 100644 --- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp +++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp @@ -224,7 +224,6 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) Mat weight_data_r2 = weight_data.reshape(maxk, group); convert_packing(weight_data_r2, weight_data_pack4, 4); - weight_data_pack4 = weight_data_pack4.reshape(maxk * (group/4)); cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4); } @@ -265,7 +264,7 @@ int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd) { Mat weight_data_r2_groups = weight_data.reshape(maxk, channels_g, num_output_g * group); - weight_data_pack4_groups.create(16*maxk, channels_g/4, num_output_g/4 * group); + weight_data_pack4_groups.create(maxk, channels_g/4, num_output_g/4 * group, (size_t)4*16, 16); for (int g=0; ginfo.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } + top_blob.create(outw, outh, num_output / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/crop_vulkan.cpp b/src/layer/vulkan/crop_vulkan.cpp index d5fa9ab86..82cc98afa 100644 --- a/src/layer/vulkan/crop_vulkan.cpp +++ b/src/layer/vulkan/crop_vulkan.cpp @@ -98,6 +98,12 @@ int Crop_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c int out_packing = _outc % 4 == 0 ? 4 : 1; size_t out_elemsize = elemsize / packing * out_packing; + if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } + top_blob.create(_outw, _outh, _outc / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) return -100; @@ -189,6 +195,12 @@ int Crop_vulkan::forward(const std::vector& bottom_blobs, std::vectorinfo.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } + VkMat& top_blob = top_blobs[0]; top_blob.create(_outw, _outh, _outc / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp index dccdf5fb3..2928ecca4 100644 --- a/src/layer/vulkan/deconvolution_vulkan.cpp +++ b/src/layer/vulkan/deconvolution_vulkan.cpp @@ -158,7 +158,7 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd) { Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); - weight_data_pack4.create(16*maxk, num_input/4, num_output/4); + weight_data_pack4.create(maxk, num_input/4, num_output/4, (size_t)4*16, 16); for (int q=0; q+3create("flatten_pack4", specializations, 2, 10); } + // pack1to4 + { + pipeline_flatten_pack1to4 = new Pipeline(vkdev); + pipeline_flatten_pack1to4->set_optimal_local_size_xyz(); + pipeline_flatten_pack1to4->create("flatten_pack1to4", specializations, 2, 10); + } + return 0; } @@ -55,6 +63,9 @@ int Flatten_vulkan::destroy_pipeline(const Option& opt) delete pipeline_flatten_pack4; pipeline_flatten_pack4 = 0; + delete pipeline_flatten_pack1to4; + pipeline_flatten_pack1to4 = 0; + return 0; } @@ -79,6 +90,12 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute int out_packing = total % 4 == 0 ? 4 : 1; size_t out_elemsize = elemsize / packing * out_packing; + if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } + if (dims == 2 && packing == 1) { top_blob = bottom_blob; @@ -106,25 +123,26 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute constants[3].i = bottom_blob.c; constants[4].i = bottom_blob.cstep; constants[5].i = top_blob.dims; - constants[6].i = (packing == 1 && out_packing == 4) ? total : top_blob.w; + constants[6].i = top_blob.w; constants[7].i = top_blob.h; constants[8].i = top_blob.c; constants[9].i = top_blob.cstep; - const Pipeline* pipeline = packing == 4 ? pipeline_flatten_pack4 : pipeline_flatten; - - if (packing == 1 && out_packing == 4) + const Pipeline* pipeline = 0; + if (packing == 1 && out_packing == 1) { - VkMat dispatcher; - dispatcher.w = total; - dispatcher.h = 1; - dispatcher.c = 1; - cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + pipeline = pipeline_flatten; } - else + else if (packing == 4 /*&& out_packing == 4*/) { - cmd.record_pipeline(pipeline, bindings, constants, top_blob); + pipeline = pipeline_flatten_pack4; } + else if (packing == 1 && out_packing == 4) + { + pipeline = pipeline_flatten_pack1to4; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); return 0; } diff --git a/src/layer/vulkan/flatten_vulkan.h b/src/layer/vulkan/flatten_vulkan.h index 72697dde2..ad75ac031 100644 --- a/src/layer/vulkan/flatten_vulkan.h +++ b/src/layer/vulkan/flatten_vulkan.h @@ -32,6 +32,7 @@ public: public: Pipeline* pipeline_flatten; Pipeline* pipeline_flatten_pack4; + Pipeline* pipeline_flatten_pack1to4; }; } // namespace ncnn diff --git a/src/layer/vulkan/innerproduct_vulkan.cpp b/src/layer/vulkan/innerproduct_vulkan.cpp index 776fceebd..13427c56c 100644 --- a/src/layer/vulkan/innerproduct_vulkan.cpp +++ b/src/layer/vulkan/innerproduct_vulkan.cpp @@ -141,7 +141,7 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd) { Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - weight_data_pack4.create(16, num_input/4, num_output/4); + weight_data_pack4.create(num_input/4, num_output/4, (size_t)4*16, 16); for (int q=0; q+3info.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } + top_blob.create(num_output / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/packing_vulkan.cpp b/src/layer/vulkan/packing_vulkan.cpp index 8a517c38f..8e4c3fbba 100644 --- a/src/layer/vulkan/packing_vulkan.cpp +++ b/src/layer/vulkan/packing_vulkan.cpp @@ -96,7 +96,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute if (dims == 1) { - if (out_packing == 1) + if (vkdev->info.support_fp16_storage && out_packing == 1) { top_blob = bottom_blob; top_blob.w = w * packing; @@ -108,6 +108,11 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute int outw = (w * packing + out_packing - 1) / out_packing; size_t out_elemsize = elemsize / packing * out_packing; + if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } top_blob.create(outw, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) @@ -118,6 +123,11 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute { int outh = (h * packing + out_packing - 1) / out_packing; size_t out_elemsize = elemsize / packing * out_packing; + if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } top_blob.create(w, outh, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) @@ -128,6 +138,11 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute { int outc = (channels * packing + out_packing - 1) / out_packing; size_t out_elemsize = elemsize / packing * out_packing; + if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } top_blob.create(w, h, outc, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) diff --git a/src/layer/vulkan/permute_vulkan.cpp b/src/layer/vulkan/permute_vulkan.cpp index ed58f14b5..5d8a210d1 100644 --- a/src/layer/vulkan/permute_vulkan.cpp +++ b/src/layer/vulkan/permute_vulkan.cpp @@ -72,6 +72,12 @@ int Permute_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute int out_packing = 1; size_t out_elemsize = elemsize / packing; + if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } + if (dims == 2) { // order_type diff --git a/src/layer/vulkan/priorbox_vulkan.cpp b/src/layer/vulkan/priorbox_vulkan.cpp index 7eaf4f3f5..2e1859846 100644 --- a/src/layer/vulkan/priorbox_vulkan.cpp +++ b/src/layer/vulkan/priorbox_vulkan.cpp @@ -107,6 +107,12 @@ int PriorBox_vulkan::forward(const std::vector& bottom_blobs, std::vector { int w = bottom_blobs[0].w; int h = bottom_blobs[0].h; + size_t elemsize = 4u; + + if (vkdev->info.support_fp16_storage) + { + elemsize = 2u; + } if (bottom_blobs.size() == 1 && image_width == -233 && image_height == -233 && max_sizes.empty()) { @@ -124,7 +130,7 @@ int PriorBox_vulkan::forward(const std::vector& bottom_blobs, std::vector int num_prior = num_sizes - 1 + num_ratios; VkMat& top_blob = top_blobs[0]; - top_blob.create(4 * w * h * num_prior, 4u, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(4 * w * h * num_prior, elemsize, 1, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) return -100; @@ -172,7 +178,7 @@ int PriorBox_vulkan::forward(const std::vector& bottom_blobs, std::vector num_prior += num_min_size * num_aspect_ratio; VkMat& top_blob = top_blobs[0]; - top_blob.create(4 * w * h * num_prior, 2, 4u, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/reorg_vulkan.cpp b/src/layer/vulkan/reorg_vulkan.cpp index be5068c3c..708c0d6ca 100644 --- a/src/layer/vulkan/reorg_vulkan.cpp +++ b/src/layer/vulkan/reorg_vulkan.cpp @@ -85,6 +85,12 @@ int Reorg_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& int out_packing = outc % 4 == 0 ? 4 : 1; size_t out_elemsize = elemsize / packing * out_packing; + if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } + top_blob.create(outw, outh, outc / out_packing, out_elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/reshape_vulkan.cpp b/src/layer/vulkan/reshape_vulkan.cpp index eb693d537..e2bca5d88 100644 --- a/src/layer/vulkan/reshape_vulkan.cpp +++ b/src/layer/vulkan/reshape_vulkan.cpp @@ -105,6 +105,12 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute out_packing = _w % 4 == 0 ? 4 : 1; size_t out_elemsize = elemsize / packing * out_packing; + if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } + if (dims == 1 && bottom_blob.w == _w && packing == out_packing) { top_blob = bottom_blob; @@ -131,6 +137,12 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute out_packing = _h % 4 == 0 ? 4 : 1; size_t out_elemsize = elemsize / packing * out_packing; + if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } + if (dims == 2 && bottom_blob.h == _h && packing == out_packing) { top_blob = bottom_blob; @@ -162,6 +174,12 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute out_packing = _c % 4 == 0 ? 4 : 1; size_t out_elemsize = elemsize / packing * out_packing; + if (vkdev->info.support_fp16_packed && !vkdev->info.support_fp16_storage) + { + if (out_packing == 4) out_elemsize = 4*2u; + if (out_packing == 1) out_elemsize = 4u; + } + if (dims == 3 && bottom_blob.c == _c && packing == out_packing) { top_blob = bottom_blob; diff --git a/src/layer/vulkan/shader/flatten_pack1to4.comp b/src/layer/vulkan/shader/flatten_pack1to4.comp new file mode 100644 index 000000000..4e8992f38 --- /dev/null +++ b/src/layer/vulkan/shader/flatten_pack1to4.comp @@ -0,0 +1,76 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_AMD_gpu_shader_half_float: require +#endif + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.outw || gy >= 1 || gz >= 1) + return; + + ivec4 i4 = gx * 4 + ivec4(0, 1, 2, 3); + + int size = p.w * p.h; + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / p.w; + ivec4 x4 = i4 % size % p.w; + + ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4; + +#if NCNN_fp16_packed + vec2 v0 = vec2(bottom_blob_data[v_offset.r], bottom_blob_data[v_offset.g]); + vec2 v1 = vec2(bottom_blob_data[v_offset.b], bottom_blob_data[v_offset.a]); + + top_blob_data[gx] = uvec2(packHalf2x16(v0), packHalf2x16(v1)); +#else + top_blob_data[gx].r = bottom_blob_data[v_offset.r]; + top_blob_data[gx].g = bottom_blob_data[v_offset.g]; + top_blob_data[gx].b = bottom_blob_data[v_offset.b]; + top_blob_data[gx].a = bottom_blob_data[v_offset.a]; +#endif +} diff --git a/src/layer/vulkan/shader/padding_pack4.comp b/src/layer/vulkan/shader/padding_pack4.comp index 466ca9249..82b0e5808 100644 --- a/src/layer/vulkan/shader/padding_pack4.comp +++ b/src/layer/vulkan/shader/padding_pack4.comp @@ -70,7 +70,12 @@ void main() } else { +#if NCNN_fp16_packed + uint v = packHalf2x16(vec2(value)); + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = uvec2(v, v); +#else top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(value); +#endif } } else if (type == 1) diff --git a/src/layer/vulkan/shader/priorbox.comp b/src/layer/vulkan/shader/priorbox.comp index 3e1d4f17a..031b5bdcc 100644 --- a/src/layer/vulkan/shader/priorbox.comp +++ b/src/layer/vulkan/shader/priorbox.comp @@ -37,7 +37,11 @@ layout (local_size_x_id = 233) in; layout (local_size_y_id = 234) in; layout (local_size_z_id = 235) in; +#if NCNN_fp16_packed +layout (binding = 0) writeonly buffer top_blob { vec4 top_blob_data[]; }; +#else layout (binding = 0) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif layout (binding = 1) readonly buffer min_sizes { sfp min_sizes_data[]; }; layout (binding = 2) readonly buffer max_sizes { sfp max_sizes_data[]; }; layout (binding = 3) readonly buffer aspect_ratios { sfp aspect_ratios_data[]; }; @@ -80,9 +84,7 @@ void main() afp min_size = sfp2afp(min_sizes_data[gx]); #if NCNN_fp16_packed - vec2 v0 = vec2(variances_0, variances_1); - vec2 v1 = vec2(variances_2, variances_3); - uvec2 variances = uvec2(packHalf2x16(v0), packHalf2x16(v1)); + vec4 variances = vec4(variances_0, variances_1, variances_2, variances_3); #elif !NCNN_fp16_storage // per component assignment makes qcom-adreno driver unhappy :( sfpvec4 variances = sfpvec4(variances_0, variances_1, variances_2, variances_3); @@ -94,7 +96,11 @@ void main() box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; +#if NCNN_fp16_packed + top_blob_data[v_offset] = vec4(box); +#else top_blob_data[v_offset] = afp2sfpvec4(box); +#endif #if NCNN_fp16_packed || !NCNN_fp16_storage top_blob_data[var_offset] = variances; #else @@ -117,7 +123,11 @@ void main() box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; +#if NCNN_fp16_packed + top_blob_data[v_offset] = vec4(box); +#else top_blob_data[v_offset] = afp2sfpvec4(box); +#endif #if NCNN_fp16_packed || !NCNN_fp16_storage top_blob_data[var_offset] = variances; #else @@ -141,7 +151,11 @@ void main() box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; +#if NCNN_fp16_packed + top_blob_data[v_offset] = vec4(box); +#else top_blob_data[v_offset] = afp2sfpvec4(box); +#endif #if NCNN_fp16_packed || !NCNN_fp16_storage top_blob_data[var_offset] = variances; #else @@ -159,7 +173,11 @@ void main() box = (center + afpvec4(-box_h, -box_w, box_h, box_w) * afp(0.5f)) * image_norm; box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; +#if NCNN_fp16_packed + top_blob_data[v_offset] = vec4(box); +#else top_blob_data[v_offset] = afp2sfpvec4(box); +#endif #if NCNN_fp16_packed || !NCNN_fp16_storage top_blob_data[var_offset] = variances; #else diff --git a/src/net.cpp b/src/net.cpp index 0af36a3ff..c813952f7 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -1407,7 +1407,7 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector // cast to fp16 VkMat bottom_blob_unpacked_fp16; - if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage) + if (vkdev->info.support_fp16_storage) { cast_float32_to_float16->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp16, cmd, opt); } @@ -1509,7 +1509,7 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector // cast to fp16 VkMat bottom_blob_unpacked_fp16; - if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage) + if (vkdev->info.support_fp16_storage) { cast_float32_to_float16->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp16, cmd, opt); } @@ -1638,7 +1638,7 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector // cast to fp32 VkMat bottom_blob_unpacked_fp32; - if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage) + if (vkdev->info.support_fp16_storage) { cast_float16_to_float32->forward(bottom_blob_unpacked, bottom_blob_unpacked_fp32, cmd, opt); } @@ -1773,7 +1773,7 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector packing_pack1->forward(bottom_blob, bottom_blob_unpacked, cmd, opt); // cast to fp32 - if (vkdev->info.support_fp16_packed || vkdev->info.support_fp16_storage) + if (vkdev->info.support_fp16_storage) { cast_float16_to_float32->forward(bottom_blob_unpacked, bottom_blobs_unpacked_fp32[i], cmd, opt); }