From c60773bde474591e33d1ee8d70f9eca41beaf222 Mon Sep 17 00:00:00 2001
From: nihui
Date: Fri, 1 Feb 2019 22:52:20 +0800
Subject: [PATCH] add transfer-transfer barrier, concat pack4

---
 src/command.cpp      | 44 +++++++++++++++++++++++++++++++++++++++++++-
 src/command.h        |  7 ++++++-
 src/layer/concat.cpp | 14 +++++++++++---
 3 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/src/command.cpp b/src/command.cpp
index f7c7b932e..fa1d236e1 100644
--- a/src/command.cpp
+++ b/src/command.cpp
@@ -445,8 +445,26 @@ void VkCompute::record_compute_compute_barrier(const VkMat& m)
     delayed_records.push_back(r);
 }
 
+void VkCompute::record_transfer_transfer_barrier(const VkMat& m)
+{
+    m.state = 2; // NOTE(review): 2 presumably means "last touched by a transfer op" (record_prepare_transfer_barrier routes state 2 here) - confirm against VkMat
+
+    if (vkdev->info.support_VK_KHR_push_descriptor) // with push descriptors the barrier is issued immediately instead of being queued
+        return transfer_transfer_barrier(m.buffer(), m.buffer_offset(), m.total() * m.elemsize);
+
+    record_type r; // otherwise queue a delayed record that submit() replays
+    r.type = 10; // 10 = transfer-transfer barrier
+    r.transfer_transfer_barrier.buffer = m.buffer();
+    r.transfer_transfer_barrier.offset = m.buffer_offset();
+    r.transfer_transfer_barrier.size = m.total() * m.elemsize; // byte size of the barrier range
+    delayed_records.push_back(r);
+}
+
 void VkCompute::record_prepare_transfer_barrier(const VkMat& m)
 {
+    if (m.state == 2) // previous access was a transfer: order it against the upcoming transfer
+        return record_transfer_transfer_barrier(m);
+
     if (m.state == 3)
         return record_compute_transfer_barrier(m);
 
@@ -470,7 +488,7 @@ int VkCompute::end()
         return end_command_buffer();
 
     record_type r;
-    r.type = 10;
+    r.type = 11; // "end" moved from 10 to 11; 10 is now the transfer-transfer barrier
     delayed_records.push_back(r);
 
     return 0;
@@ -519,6 +537,9 @@ int VkCompute::submit()
             compute_compute_barrier(r.compute_compute_barrier.buffer, r.compute_compute_barrier.offset, r.compute_compute_barrier.size);
             break;
         case 10:
+            transfer_transfer_barrier(r.transfer_transfer_barrier.buffer, r.transfer_transfer_barrier.offset, r.transfer_transfer_barrier.size);
+            break;
+        case 11:
             end_command_buffer();
             break;
         }
@@ -649,6 +670,27 @@ void VkCompute::compute_compute_barrier(VkBuffer buffer, size_t offset, size_t s
     vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0);
 }
 
+void VkCompute::transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size)
+{
+// fprintf(stderr, "cmd transfer_transfer_barrier %p\n", buffer);
+
+    VkBufferMemoryBarrier bufferBarrier; // buffer barrier between two transfer-stage accesses to [offset, offset + size)
+    bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
+    bufferBarrier.pNext = 0;
+    bufferBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; // earlier transfer writes
+    bufferBarrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT; // the following transfer may read or write this range; write-after-write also needs a memory dependency
+    bufferBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; // no queue family ownership transfer
+    bufferBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    bufferBarrier.buffer = buffer;
+    bufferBarrier.offset = offset;
+    bufferBarrier.size = size;
+
+    VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; // both sides of the dependency are transfer-stage work
+    VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
+
+    vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0);
+}
+
 VkTransfer::VkTransfer(VulkanDevice* _vkdev) : Command(_vkdev, _vkdev->info.transfer_queue_index)
 {
     staging_data = 0;
diff --git a/src/command.h b/src/command.h
index 9f6a703d6..67a096e93 100644
--- a/src/command.h
+++ b/src/command.h
@@ -78,6 +78,8 @@ public:
 
     void record_compute_compute_barrier(const VkMat& m);
 
+    void record_transfer_transfer_barrier(const VkMat& m);
+
     void record_prepare_transfer_barrier(const VkMat& m);
 
     void record_prepare_compute_barrier(const VkMat& m);
@@ -109,6 +111,7 @@ protected:
     void transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t size);
     void compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t size);
     void compute_compute_barrier(VkBuffer buffer, size_t offset, size_t size);
+    void transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size);
 
 protected:
     // delayed record
@@ -127,7 +130,8 @@ protected:
         // 7=transfer-compute barrier
         // 8=compute-transfer barrier
         // 9=compute-compute barrier
-        // 10=end
+        // 10=transfer-transfer barrier
+        // 11=end
         int type;
 
         union
@@ -141,6 +145,7 @@ protected:
             struct { VkBuffer buffer; size_t offset; 
size_t size; } transfer_compute_barrier; struct { VkBuffer buffer; size_t offset; size_t size; } compute_transfer_barrier; struct { VkBuffer buffer; size_t offset; size_t size; } compute_compute_barrier; + struct { VkBuffer buffer; size_t offset; size_t size; } transfer_transfer_barrier; }; std::vector regions; diff --git a/src/layer/concat.cpp b/src/layer/concat.cpp index e28fc6a20..a3dabdcec 100644 --- a/src/layer/concat.cpp +++ b/src/layer/concat.cpp @@ -264,6 +264,8 @@ int Concat::forward(const std::vector& bottom_blobs, std::vector& top_ int Concat::forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const { int dims = bottom_blobs[0].dims; + size_t elemsize = bottom_blobs[0].elemsize; + int packing = bottom_blobs[0].packing; if (dims == 1) // axis == 0 { @@ -277,10 +279,12 @@ int Concat::forward(const std::vector& bottom_blobs, std::vector& } VkMat& top_blob = top_blobs[0]; - top_blob.create(top_w, 4u, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(top_w, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) return -100; + cmd.record_prepare_transfer_barrier(top_blob); + int dstOffset = 0; for (size_t b=0; b& bottom_blobs, std::vector& } VkMat& top_blob = top_blobs[0]; - top_blob.create(w, top_h, 4u, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(w, top_h, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) return -100; + cmd.record_prepare_transfer_barrier(top_blob); + int dstOffset = 0; for (size_t b=0; b& bottom_blobs, std::vector& } VkMat& top_blob = top_blobs[0]; - top_blob.create(w, h, top_channels, 4u, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(w, h, top_channels, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator); if (top_blob.empty()) return -100; + cmd.record_prepare_transfer_barrier(top_blob); + int dstOffset = 0; for (size_t b=0; b