| @@ -445,8 +445,26 @@ void VkCompute::record_compute_compute_barrier(const VkMat& m) | |||
| delayed_records.push_back(r); | |||
| } | |||
| void VkCompute::record_transfer_transfer_barrier(const VkMat& m) | | |
| { | | |
| // Flag the blob with state 2; record_prepare_transfer_barrier() routes state 2 back to this function. | | |
| m.state = 2; | | |
| // Buffer range covered by the barrier, gathered once for both paths below. | | |
| const VkBuffer target_buffer = m.buffer(); | | |
| const size_t range_offset = m.buffer_offset(); | | |
| const size_t range_bytes = m.total() * m.elemsize; | | |
| if (vkdev->info.support_VK_KHR_push_descriptor) | | |
| { | | |
| // With push descriptor support the barrier is emitted right away instead of being queued. | | |
| transfer_transfer_barrier(target_buffer, range_offset, range_bytes); | | |
| return; | | |
| } | | |
| // Otherwise queue a delayed record of type 10 (transfer-transfer barrier) that submit() replays. | | |
| record_type pending; | | |
| pending.type = 10; | | |
| pending.transfer_transfer_barrier.buffer = target_buffer; | | |
| pending.transfer_transfer_barrier.offset = range_offset; | | |
| pending.transfer_transfer_barrier.size = range_bytes; | | |
| delayed_records.push_back(pending); | | |
| } | | |
| void VkCompute::record_prepare_transfer_barrier(const VkMat& m) | |||
| { | |||
| if (m.state == 2) | |||
| return record_transfer_transfer_barrier(m); | |||
| if (m.state == 3) | |||
| return record_compute_transfer_barrier(m); | |||
| @@ -470,7 +488,7 @@ int VkCompute::end() | |||
| return end_command_buffer(); | |||
| record_type r; | |||
| r.type = 10; | |||
| r.type = 11; | |||
| delayed_records.push_back(r); | |||
| return 0; | |||
| @@ -519,6 +537,9 @@ int VkCompute::submit() | |||
| compute_compute_barrier(r.compute_compute_barrier.buffer, r.compute_compute_barrier.offset, r.compute_compute_barrier.size); | |||
| break; | |||
| case 10: | |||
| transfer_transfer_barrier(r.compute_compute_barrier.buffer, r.compute_compute_barrier.offset, r.compute_compute_barrier.size); | |||
| break; | |||
| case 11: | |||
| end_command_buffer(); | |||
| break; | |||
| } | |||
| @@ -649,6 +670,27 @@ void VkCompute::compute_compute_barrier(VkBuffer buffer, size_t offset, size_t s | |||
| vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0); | |||
| } | |||
| void VkCompute::transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size) | | |
| { | | |
| // Records a buffer memory barrier (transfer write -> transfer read) over [offset, offset + size) of buffer. | | |
| // fprintf(stderr, "cmd transfer_transfer_barrier %p\n", buffer); | | |
| // Source and destination are both the transfer stage, so one constant serves for both masks. | | |
| const VkPipelineStageFlags transferStage = VK_PIPELINE_STAGE_TRANSFER_BIT; | | |
| VkBufferMemoryBarrier barrier; | | |
| barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; | | |
| barrier.pNext = 0; | | |
| // Affected buffer range. | | |
| barrier.buffer = buffer; | | |
| barrier.offset = offset; | | |
| barrier.size = size; | | |
| // Access scopes: transfer writes before the barrier, transfer reads after it. | | |
| barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; | | |
| barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; | | |
| // No queue family ownership transfer. | | |
| barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; | | |
| barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; | | |
| vkCmdPipelineBarrier(command_buffer, transferStage, transferStage, 0, 0, 0, 1, &barrier, 0, 0); | | |
| } | | |
| VkTransfer::VkTransfer(VulkanDevice* _vkdev) : Command(_vkdev, _vkdev->info.transfer_queue_index) | |||
| { | |||
| staging_data = 0; | |||
| @@ -78,6 +78,8 @@ public: | |||
| void record_compute_compute_barrier(const VkMat& m); | |||
| void record_transfer_transfer_barrier(const VkMat& m); | |||
| void record_prepare_transfer_barrier(const VkMat& m); | |||
| void record_prepare_compute_barrier(const VkMat& m); | |||
| @@ -109,6 +111,7 @@ protected: | |||
| void transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t size); | |||
| void compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t size); | |||
| void compute_compute_barrier(VkBuffer buffer, size_t offset, size_t size); | |||
| void transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size); | |||
| protected: | |||
| // delayed record | |||
| @@ -127,7 +130,8 @@ protected: | |||
| // 7=transfer-compute barrier | |||
| // 8=compute-transfer barrier | |||
| // 9=compute-compute barrier | |||
| // 10=end | |||
| // 10=transfer-transfer barrier | |||
| // 11=end | |||
| int type; | |||
| union | |||
| @@ -141,6 +145,7 @@ protected: | |||
| struct { VkBuffer buffer; size_t offset; size_t size; } transfer_compute_barrier; | |||
| struct { VkBuffer buffer; size_t offset; size_t size; } compute_transfer_barrier; | |||
| struct { VkBuffer buffer; size_t offset; size_t size; } compute_compute_barrier; | |||
| struct { VkBuffer buffer; size_t offset; size_t size; } transfer_transfer_barrier; | |||
| }; | |||
| std::vector<VkBufferCopy> regions; | |||
| @@ -264,6 +264,8 @@ int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_ | |||
| int Concat::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int dims = bottom_blobs[0].dims; | |||
| size_t elemsize = bottom_blobs[0].elemsize; | |||
| int packing = bottom_blobs[0].packing; | |||
| if (dims == 1) // axis == 0 | |||
| { | |||
| @@ -277,10 +279,12 @@ int Concat::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& | |||
| } | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(top_w, 4u, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(top_w, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| cmd.record_prepare_transfer_barrier(top_blob); | |||
| int dstOffset = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| @@ -316,10 +320,12 @@ int Concat::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& | |||
| } | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, top_h, 4u, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, top_h, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| cmd.record_prepare_transfer_barrier(top_blob); | |||
| int dstOffset = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||
| @@ -361,10 +367,12 @@ int Concat::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& | |||
| } | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, h, top_channels, 4u, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, h, top_channels, elemsize, packing, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| cmd.record_prepare_transfer_barrier(top_blob); | |||
| int dstOffset = 0; | |||
| for (size_t b=0; b<bottom_blobs.size(); b++) | |||
| { | |||