diff --git a/docs/developer-guide/low-level-operation-api.md b/docs/developer-guide/low-level-operation-api.md index 703994890..e4d2f51ce 100644 --- a/docs/developer-guide/low-level-operation-api.md +++ b/docs/developer-guide/low-level-operation-api.md @@ -150,6 +150,17 @@ ncnn::VkWeightStagingBufferAllocator g_weight_staging_vkallocator(vkdev); ncnn::Layer* convolution = ncnn::create_layer("Convolution"); convolution->vkdev = vkdev; +// set option +ncnn::Option opt; +opt.lightmode = true; +opt.num_threads = 4; +opt.blob_allocator = 0; +opt.workspace_allocator = 0; +opt.vulkan_compute = true; +opt.blob_vkallocator = &g_blob_vkallocator; +opt.workspace_vkallocator = &g_blob_vkallocator; +opt.staging_vkallocator = &g_staging_vkallocator; + // load param { ncnn::ParamDict pd; @@ -171,76 +182,42 @@ ncnn::ModelBinFromMatArray mb(weights); convolution->load_model(mb); } -// upload model -{ -ncnn::VkTransfer cmd(vkdev); -cmd.weight_vkallocator = &g_weight_vkallocator; -cmd.staging_vkallocator = &g_weight_staging_vkallocator; - -convolution->upload_model(cmd); - -cmd.submit(); -cmd.wait(); - -g_weight_staging_vkallocator.clear(); -} - // create pipeline convolution->create_pipeline(opt); -// set default option +// upload model { -ncnn::Option opt = ncnn::get_default_option(); +ncnn::VkTransfer cmd(vkdev); -opt.lightmode = true; -opt.num_threads = 4; -opt.blob_allocator = 0; -opt.workspace_allocator = 0; +ncnn::Option opt_upload = opt; +opt_upload.blob_vkallocator = &g_weight_vkallocator; +opt_upload.workspace_vkallocator = &g_weight_vkallocator; +opt_upload.staging_vkallocator = &g_weight_staging_vkallocator; -opt.vulkan_compute = true; -opt.blob_vkallocator = &g_blob_vkallocator; -opt.workspace_vkallocator = &g_blob_vkallocator; -opt.staging_vkallocator = &g_staging_vkallocator; +convolution->upload_model(cmd, opt_upload); -ncnn::set_default_option(opt); +cmd.submit_and_wait(); } ncnn::Mat bottom = random_mat(w, h, inch); -ncnn::VkMat bottom_gpu; - -// copy bottom to bottom_gpu -{ -bottom_gpu.create_like(bottom, &g_blob_vkallocator, &g_staging_vkallocator); -bottom_gpu.prepare_staging_buffer(); -bottom_gpu.upload(bottom); -} - -ncnn::VkMat top_gpu; +ncnn::Mat top; // forward { ncnn::VkCompute cmd(vkdev); -cmd.record_upload(bottom_gpu); +ncnn::VkMat bottom_gpu; +cmd.record_upload(bottom, bottom_gpu, opt); +ncnn::VkMat top_gpu; convolution->forward(bottom_gpu, top_gpu, cmd, opt); -top_gpu.prepare_staging_buffer(); - -cmd.record_download(top_gpu); +cmd.record_download(top_gpu, top, opt); cmd.submit_and_wait(); } -ncnn::Mat top; - -// copy top_gpu to top -{ -top.create_like(top_gpu); -top_gpu.download(top); -} - convolution->destroy_pipeline(opt); delete convolution; diff --git a/src/allocator.cpp b/src/allocator.cpp index 1c7662d99..5b7569d5c 100644 --- a/src/allocator.cpp +++ b/src/allocator.cpp @@ -470,7 +470,8 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size) ptr->memory = buffer_blocks[i]->memory; ptr->capacity = aligned_size; ptr->mapped_ptr = buffer_blocks[i]->mapped_ptr; - ptr->state = 1; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; // adjust budgets if (budget_size == aligned_size) @@ -540,7 +541,8 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size) ptr->memory = block->memory; ptr->capacity = aligned_size; ptr->mapped_ptr = block->mapped_ptr; - ptr->state = 1; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; // adjust budgets std::list< std::pair > budget; @@ -715,7 +717,8 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size) ptr->memory = buffer_blocks[block_index]->memory; ptr->capacity = aligned_size; ptr->mapped_ptr = buffer_blocks[block_index]->mapped_ptr; - ptr->state = 1; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; buffer_block_free_spaces[block_index] -= aligned_size; @@ -790,7 +793,8 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size) ptr->memory = block->memory; ptr->capacity = new_block_size; ptr->mapped_ptr = block->mapped_ptr; - ptr->state = 1; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; return ptr; } @@ -841,7 +845,8 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size) ptr->memory = block->memory; ptr->capacity = aligned_size; ptr->mapped_ptr = block->mapped_ptr; - ptr->state = 1; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; return ptr; } @@ -940,7 +945,8 @@ VkBufferMemory* VkStagingBufferAllocator::fastMalloc(size_t size) vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr); - ptr->state = 1; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; // fprintf(stderr, "VkStagingBufferAllocator M %p %lu\n", ptr->buffer, size); @@ -989,7 +995,8 @@ VkBufferMemory* VkWeightStagingBufferAllocator::fastMalloc(size_t size) vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr); - ptr->state = 1; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; // fprintf(stderr, "VkWeightStagingBufferAllocator M %p %lu\n", ptr->buffer, size); @@ -1137,7 +1144,8 @@ VkImageMemory* VkSimpleImageAllocator::fastMalloc(int width, int height, VkForma ptr->imageview = create_imageview(ptr->image, format); - ptr->state = 1; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; return ptr; } @@ -1290,7 +1298,8 @@ VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*width*/, ptr->image = image; ptr->memory = memory; ptr->imageview = imageview; - ptr->state = 1; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; return ptr; } diff --git a/src/allocator.h b/src/allocator.h index a95469dbd..a70b61c8f 100644 --- a/src/allocator.h +++ b/src/allocator.h @@ -192,12 +192,8 @@ public: void* mapped_ptr; // buffer state, modified by command functions internally - // 0=null - // 1=created - // 2=transfer - // 3=compute - // 4=readonly - mutable int state; + mutable VkAccessFlags access_flags; + mutable VkPipelineStageFlags stage_flags; // initialize and modified by mat int refcount; @@ -311,13 +307,9 @@ public: VkDeviceMemory memory; - // buffer state, modified by command functions internally - // 0=null - // 1=created - // 2=transfer - // 3=compute - // 4=readonly - mutable int state; + // image state, modified by command functions internally + mutable VkAccessFlags access_flags; + mutable VkPipelineStageFlags stage_flags; // initialize and modified by mat int refcount; diff --git a/src/command.cpp b/src/command.cpp index c2103a571..361401a9e 100644 --- a/src/command.cpp +++ b/src/command.cpp @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at @@ -17,167 +17,24 @@ #if NCNN_VULKAN #include +#include #include "option.h" +#include "pipeline.h" namespace ncnn { -Command::Command(const VulkanDevice* _vkdev, uint32_t _queue_family_index) : vkdev(_vkdev), queue_family_index(_queue_family_index) +VkCompute::VkCompute(const VulkanDevice* _vkdev) : vkdev(_vkdev) { - create_command_pool(); + compute_command_pool = 0; + compute_command_buffer = 0; + compute_command_fence = 0; - create_command_buffer(); - - // create fence - VkFenceCreateInfo fenceCreateInfo; - fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fenceCreateInfo.pNext = 0; - fenceCreateInfo.flags = 0; - - VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &fence); - if (ret != VK_SUCCESS) - { - fprintf(stderr, "vkCreateFence failed %d\n", ret); - } -} - -Command::~Command() -{ - vkDestroyFence(vkdev->vkdevice(), fence, 0); - - vkFreeCommandBuffers(vkdev->vkdevice(), command_pool, 1, &command_buffer); - - vkDestroyCommandPool(vkdev->vkdevice(), command_pool, 0); -} - -int Command::create_command_pool() -{ - VkCommandPoolCreateInfo commandPoolCreateInfo; - commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - commandPoolCreateInfo.pNext = 0; - commandPoolCreateInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; - commandPoolCreateInfo.queueFamilyIndex = queue_family_index; - - VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &command_pool); - if (ret != VK_SUCCESS) - { - fprintf(stderr, "vkCreateCommandPool failed %d\n", ret); - return -1; - } - - return 0; -} - -int Command::create_command_buffer() -{ - VkCommandBufferAllocateInfo commandBufferAllocateInfo; - commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; - commandBufferAllocateInfo.pNext = 0; - commandBufferAllocateInfo.commandPool = command_pool; - commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - commandBufferAllocateInfo.commandBufferCount = 1; - - VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &command_buffer); - if (ret != VK_SUCCESS) - { - fprintf(stderr, "vkAllocateCommandBuffers failed %d\n", ret); - return -1; - } - - return 0; -} - -int Command::begin_command_buffer() -{ -// fprintf(stderr, "==================== begin\n"); - - VkCommandBufferBeginInfo commandBufferBeginInfo; - commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - commandBufferBeginInfo.pNext = 0; - commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - commandBufferBeginInfo.pInheritanceInfo = 0; - - VkResult ret = vkBeginCommandBuffer(command_buffer, &commandBufferBeginInfo); - if (ret != VK_SUCCESS) - { - fprintf(stderr, "vkBeginCommandBuffer failed %d\n", ret); - return -1; - } - - return 0; -} - -int Command::end_command_buffer() -{ -// fprintf(stderr, "==================== end\n"); - - VkResult ret = vkEndCommandBuffer(command_buffer); - if (ret != VK_SUCCESS) - { - fprintf(stderr, "vkEndCommandBuffer failed %d\n", ret); - return -1; - } - - return 0; -} - -int Command::queue_submit_and_wait_fence() -{ - // acquire queue and reclaim on return - VkQueue queue = vkdev->acquire_queue(queue_family_index); - if (queue == 0) - { - fprintf(stderr, "out of compute queue\n"); - return -1; - } - -// fprintf(stderr, "==================== submit\n"); - { - VkSubmitInfo submitInfo; - submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submitInfo.pNext = 0; - submitInfo.waitSemaphoreCount = 0; - submitInfo.pWaitSemaphores = 0; - submitInfo.pWaitDstStageMask = 0; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &command_buffer; - submitInfo.signalSemaphoreCount = 0; - submitInfo.pSignalSemaphores = 0; - - VkResult ret = vkQueueSubmit(queue, 1, &submitInfo, fence); - if (ret != VK_SUCCESS) - { - fprintf(stderr, "vkQueueSubmit failed %d\n", ret); - vkdev->reclaim_queue(queue_family_index, queue); - return -1; - } - } - - vkdev->reclaim_queue(queue_family_index, queue); - -// fprintf(stderr, "==================== wait\n"); - { - VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &fence, VK_TRUE, UINT64_MAX); - if (ret != VK_SUCCESS) - { - fprintf(stderr, "vkWaitForFences failed %d\n", ret); - return -1; - } - } - - return 0; -} - -VkCompute::VkCompute(const VulkanDevice* _vkdev) : Command(_vkdev, _vkdev->info.compute_queue_family_index) -{ #if NCNN_BENCHMARK query_count = 0; query_pool = 0; #endif // NCNN_BENCHMARK - if (vkdev->info.support_VK_KHR_push_descriptor) - { - begin_command_buffer(); - } + init(); } VkCompute::~VkCompute() @@ -195,727 +52,1000 @@ VkCompute::~VkCompute() if (query_pool) { // all submitted commands that refer to queryPool must have completed execution - vkResetCommandBuffer(command_buffer, 0); + vkResetCommandBuffer(compute_command_buffer, 0); vkDestroyQueryPool(vkdev->vkdevice(), query_pool, 0); } #endif // NCNN_BENCHMARK -} -void VkCompute::record_upload(const VkMat& m) -{ - if (m.allocator->mappable) - return; - - record_prepare_transfer_barrier(m); + vkDestroyFence(vkdev->vkdevice(), compute_command_fence, 0); - if (vkdev->info.support_VK_KHR_push_descriptor) - return copy_buffer(m.staging_buffer(), m.staging_buffer_offset(), m.buffer(), m.buffer_offset(), m.total() * m.elemsize); - - record_type r; - r.type = 0; - r.copy.src = m.staging_buffer(); - r.copy.src_offset = m.staging_buffer_offset(); - r.copy.dst = m.buffer(); - r.copy.dst_offset = m.buffer_offset(); - r.copy.size = m.total() * m.elemsize; - delayed_records.push_back(r); + vkFreeCommandBuffers(vkdev->vkdevice(), compute_command_pool, 1, &compute_command_buffer); + vkDestroyCommandPool(vkdev->vkdevice(), compute_command_pool, 0); } -void VkCompute::record_download(const VkMat& m) +void VkCompute::record_upload(const Mat& src, VkMat& dst, const Option& opt) { - if (m.allocator->mappable) - { - record_prepare_host_barrier(m); - return; - } +// fprintf(stderr, "record_upload\n"); - record_prepare_transfer_barrier(m); + // create dst + dst.create_like(src, opt.blob_vkallocator); - if (vkdev->info.support_VK_KHR_push_descriptor) + if (dst.allocator->mappable) { - copy_buffer(m.buffer(), m.buffer_offset(), m.staging_buffer(), m.staging_buffer_offset(), m.total() * m.elemsize); - record_prepare_host_barrier(m); - return; - } - - record_type r; - r.type = 0; - r.copy.src = m.buffer(); - r.copy.src_offset = m.buffer_offset(); - r.copy.dst = m.staging_buffer(); - r.copy.dst_offset = m.staging_buffer_offset(); - r.copy.size = m.total() * m.elemsize; - delayed_records.push_back(r); - - record_prepare_host_barrier(m); -} - -void VkCompute::record_clone(const VkMat& src, const VkMat& dst) -{ - record_prepare_transfer_barrier(src); - record_prepare_transfer_barrier(dst); - - if (vkdev->info.support_VK_KHR_push_descriptor) - return copy_buffer(src.buffer(), src.buffer_offset(), dst.buffer(), dst.buffer_offset(), src.total() * src.elemsize); - - record_type r; - r.type = 0; - r.copy.src = src.buffer(); - r.copy.src_offset = src.buffer_offset(); - r.copy.dst = dst.buffer(); - r.copy.dst_offset = dst.buffer_offset(); - r.copy.size = src.total() * src.elemsize; - delayed_records.push_back(r); -} + // memcpy src to device + memcpy(dst.mapped_ptr(), src.data, src.total() * src.elemsize); + dst.allocator->flush(dst.data); -void VkCompute::record_copy_region(const VkMat& src, const VkMat& dst, const VkBufferCopy& region) -{ - std::vector regions(1); - regions[0] = region; + // mark device host-write @ null + dst.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; - record_copy_regions(src, dst, regions); -} + return; + } -void VkCompute::record_copy_regions(const VkMat& src, const VkMat& dst, const std::vector& regions) -{ - record_prepare_transfer_barrier(src); - record_prepare_transfer_barrier(dst); + // create staging + VkMat dst_staging; + dst_staging.create_like(src, opt.staging_vkallocator); - if (vkdev->info.support_VK_KHR_push_descriptor) - return copy_buffer_regions(src.buffer(), dst.buffer(), regions); - - record_type r; - r.type = 1; - r.copy_regions.src = src.buffer(); - r.copy_regions.dst = dst.buffer(); - r.regions = regions; - delayed_records.push_back(r); -} + // memcpy src to staging + memcpy(dst_staging.mapped_ptr(), src.data, src.total() * src.elemsize); + dst_staging.allocator->flush(dst_staging.data); -void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkMat& m) -{ - const int binding_count = bindings.size(); - for (int i=0; istate == 4) - continue; - - record_prepare_compute_barrier(bindings[i]); + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = dst_staging.buffer(); + barriers[0].offset = dst_staging.buffer_offset(); + barriers[0].size = dst_staging.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } } - record_bind_pipeline(pipeline->pipeline); - - record_update_bindings(pipeline->pipeline_layout, pipeline->descriptorset_layout, pipeline->descriptor_update_template, bindings); - - record_push_constants(pipeline->pipeline_layout, constants); - - uint32_t group_count_xyz[3]; - group_count_xyz[0] = (m.w + pipeline->local_size_x - 1) / pipeline->local_size_x; - group_count_xyz[1] = (m.h + pipeline->local_size_y - 1) / pipeline->local_size_y; - group_count_xyz[2] = (m.c + pipeline->local_size_z - 1) / pipeline->local_size_z; - - record_dispatch(group_count_xyz); -} - -#if NCNN_BENCHMARK -void VkCompute::record_write_timestamp(uint32_t query) -{ - if (vkdev->info.support_VK_KHR_push_descriptor) - return write_timestamp(query); + // record staging to device + { + VkBufferCopy* regions = new VkBufferCopy[1]; + regions[0].srcOffset = dst_staging.buffer_offset(); + regions[0].dstOffset = dst.buffer_offset(); + regions[0].size = std::min(dst_staging.buffer_capacity(), dst.buffer_capacity()); - record_type r; - r.type = 10; - r.write_timestamp.query = query; - delayed_records.push_back(r); -} -#endif // NCNN_BENCHMARK + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdCopyBuffer(compute_command_buffer, dst_staging.buffer(), dst.buffer(), 1, regions); + delete[] regions; + } + else + { + record r; + r.type = record::TYPE_copy_buffer; + r.command_buffer = compute_command_buffer; + r.copy_buffer.src = dst_staging.buffer(); + r.copy_buffer.dst = dst.buffer(); + r.copy_buffer.region_count = 1; + r.copy_buffer.regions = regions; + delayed_records.push_back(r); + } + } -void VkCompute::record_queue_transfer_acquire(const VkMat& m, uint32_t src_queue_family_index) -{ - if (queue_family_index == src_queue_family_index) - return; + // mark device transfer-write @ queue + dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; - if (vkdev->info.support_VK_KHR_push_descriptor) - return queue_transfer_acquire_barrier(m.buffer(), m.buffer_offset(), m.total() * m.elemsize, src_queue_family_index); - - record_type r; - r.type = 16; - r.queue_transfer_acquire_barrier.buffer = m.buffer(); - r.queue_transfer_acquire_barrier.offset = m.buffer_offset(); - r.queue_transfer_acquire_barrier.size = m.total() * m.elemsize; - r.queue_transfer_acquire_barrier.src_queue_family_index = src_queue_family_index; - delayed_records.push_back(r); + // stash staging + upload_staging_buffers.push_back(dst_staging); } -#if __ANDROID_API__ >= 26 -void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& im, const VkMat& m) +void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt) { - record_initial_image_compute_barrier(im); - - record_bind_pipeline(pipeline->pipeline); +// fprintf(stderr, "record_download\n"); - record_update_import_android_hardware_buffer_bindings(pipeline->pipeline_layout, pipeline->descriptorset_layout, pipeline->descriptor_update_template, pipeline->sampler, im, m); + // create dst + dst.create_like(src, opt.blob_allocator); - uint32_t group_count_xyz[3]; - group_count_xyz[0] = (m.w + 7) / 8; - group_count_xyz[1] = (m.h + 7) / 8; - group_count_xyz[2] = 1; - - record_dispatch(group_count_xyz); -} -#endif // __ANDROID_API__ >= 26 - -void VkCompute::record_bind_pipeline(VkPipeline pipeline) -{ - if (vkdev->info.support_VK_KHR_push_descriptor) - return bind_pipeline(pipeline); + if (src.allocator->mappable) + { + // barrier device any @ compute to host-read @ compute + if (src.data->access_flags != VK_ACCESS_HOST_READ_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT) + { + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = src.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = src.buffer(); + barriers[0].offset = src.buffer_offset(); + barriers[0].size = src.buffer_capacity(); + + VkPipelineStageFlags src_stage = src.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device host-read @ any + src.data->access_flags = VK_ACCESS_HOST_READ_BIT; + src.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + } - record_type r; - r.type = 2; - r.bind_pipeline.pipeline = pipeline; - delayed_records.push_back(r); -} + // stash download post buffer and mat + download_post_buffers.push_back(src); + download_post_mats.push_back(dst); -void VkCompute::record_update_bindings(VkPipelineLayout pipeline_layout, VkDescriptorSetLayout descriptorset_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const std::vector& bindings) -{ - const int binding_count = bindings.size(); + // post memcpy device to dst + { + record r; + r.type = record::TYPE_post_download; + r.command_buffer = 0; + r.post_download.download_post_buffer_mat_offset = download_post_buffers.size() - 1; + delayed_records.push_back(r); + } - if (binding_count == 0) return; - - std::vector descriptorBufferInfos(binding_count); - for (int i=0; iinfo.support_VK_KHR_push_descriptor) - return update_bindings(pipeline_layout, descriptor_update_template, descriptorBufferInfos); - - // create new descriptor_pool and descriptorset - VkDescriptorPool descriptor_pool; + if (src.data->access_flags != VK_ACCESS_TRANSFER_READ_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT) { - VkDescriptorPoolSize poolSize; - poolSize.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - poolSize.descriptorCount = binding_count; - - VkDescriptorPoolCreateInfo descriptorPoolCreateInfo; - descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - descriptorPoolCreateInfo.pNext = 0; - descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; - descriptorPoolCreateInfo.maxSets = 1; - descriptorPoolCreateInfo.poolSizeCount = 1; - descriptorPoolCreateInfo.pPoolSizes = &poolSize; - - VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool); - if (ret != VK_SUCCESS) + // barrier device any @ compute to transfer-read @ compute + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = src.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = src.buffer(); + barriers[0].offset = src.buffer_offset(); + barriers[0].size = src.buffer_capacity(); + + VkPipelineStageFlags src_stage = src.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) { - fprintf(stderr, "vkCreateDescriptorPool failed %d\n", ret); - return; + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device transfer-read @ transfer + src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT; + src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; } - descriptor_pools.push_back(descriptor_pool); - VkDescriptorSet descriptorset; + // create staging + VkMat src_staging; + src_staging.create_like(src, opt.staging_vkallocator); + + // record device to staging { - VkDescriptorSetAllocateInfo descriptorSetAllocateInfo; - descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - descriptorSetAllocateInfo.pNext = 0; - descriptorSetAllocateInfo.descriptorPool = descriptor_pool; - descriptorSetAllocateInfo.descriptorSetCount = 1; - descriptorSetAllocateInfo.pSetLayouts = &descriptorset_layout; + VkBufferCopy* regions = new VkBufferCopy[1]; + regions[0].srcOffset = src.buffer_offset(); + regions[0].dstOffset = src_staging.buffer_offset(); + regions[0].size = std::min(src.buffer_capacity(), src_staging.buffer_capacity()); - VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset); - if (ret != VK_SUCCESS) + if (vkdev->info.support_VK_KHR_push_descriptor) { - fprintf(stderr, "vkAllocateDescriptorSets failed %d\n", ret); - return; + vkCmdCopyBuffer(compute_command_buffer, src.buffer(), src_staging.buffer(), 1, regions); + delete[] regions; + } + else + { + record r; + r.type = record::TYPE_copy_buffer; + r.command_buffer = compute_command_buffer; + r.copy_buffer.src = src.buffer(); + r.copy_buffer.dst = src_staging.buffer(); + r.copy_buffer.region_count = 1; + r.copy_buffer.regions = regions; + delayed_records.push_back(r); } } - descriptorsets.push_back(descriptorset); - -// fprintf(stderr, "update descriptorset %p\n", descriptorset); - if (vkdev->info.support_VK_KHR_descriptor_update_template) + // barrier staging transfer-write @ compute to host-read @ compute { - vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, descriptor_update_template, descriptorBufferInfos.data()); - } - else - { - std::vector writeDescriptorSets(binding_count); - for (int i=0; iinfo.support_VK_KHR_push_descriptor) { - writeDescriptorSets[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[i].pNext = 0; - writeDescriptorSets[i].dstSet = descriptorset; - writeDescriptorSets[i].dstBinding = i; - writeDescriptorSets[i].dstArrayElement = 0; - writeDescriptorSets[i].descriptorCount = 1; - writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - writeDescriptorSets[i].pImageInfo = 0; - writeDescriptorSets[i].pBufferInfo = &descriptorBufferInfos[i]; - writeDescriptorSets[i].pTexelBufferView = 0; + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); } - - vkUpdateDescriptorSets(vkdev->vkdevice(), binding_count, writeDescriptorSets.data(), 0, 0); } - record_type r; - r.type = 3; - r.bind_descriptorset.pipeline_layout = pipeline_layout; - r.bind_descriptorset.descriptorset = descriptorset; - delayed_records.push_back(r); -} - -void VkCompute::record_push_constants(VkPipelineLayout pipeline_layout, const std::vector& constants) -{ - if (vkdev->info.support_VK_KHR_push_descriptor) - return push_constants(pipeline_layout, constants); - - record_type r; - r.type = 4; - r.push_constants.pipeline_layout = pipeline_layout; - r.constants = constants; - delayed_records.push_back(r); -} - -void VkCompute::record_dispatch(const uint32_t* group_count_xyz) -{ - if (vkdev->info.support_VK_KHR_push_descriptor) - return dispatch(group_count_xyz); - - record_type r; - r.type = 5; - r.dispatch.group_count_xyz[0] = group_count_xyz[0]; - r.dispatch.group_count_xyz[1] = group_count_xyz[1]; - r.dispatch.group_count_xyz[2] = group_count_xyz[2]; - delayed_records.push_back(r); -} - -void VkCompute::record_transfer_compute_barrier(const VkMat& m) -{ - m.data->state = 3; - - if (vkdev->info.support_VK_KHR_push_descriptor) - return transfer_compute_barrier(m.buffer(), m.buffer_offset(), m.total() * m.elemsize); - - record_type r; - r.type = 6; - r.transfer_compute_barrier.buffer = m.buffer(); - r.transfer_compute_barrier.offset = m.buffer_offset(); - r.transfer_compute_barrier.size = m.total() * m.elemsize; - delayed_records.push_back(r); -} - -void VkCompute::record_compute_transfer_barrier(const VkMat& m) -{ - m.data->state = 2; - - if (vkdev->info.support_VK_KHR_push_descriptor) - return compute_transfer_barrier(m.buffer(), m.buffer_offset(), m.total() * m.elemsize); - - record_type r; - r.type = 7; - r.compute_transfer_barrier.buffer = m.buffer(); - r.compute_transfer_barrier.offset = m.buffer_offset(); - r.compute_transfer_barrier.size = m.total() * m.elemsize; - delayed_records.push_back(r); -} - -void VkCompute::record_compute_compute_barrier(const VkMat& m) -{ - m.data->state = 3; - - if (vkdev->info.support_VK_KHR_push_descriptor) - return compute_compute_barrier(m.buffer(), m.buffer_offset(), m.total() * m.elemsize); - - record_type r; - r.type = 8; - r.compute_compute_barrier.buffer = m.buffer(); - r.compute_compute_barrier.offset = m.buffer_offset(); - r.compute_compute_barrier.size = m.total() * m.elemsize; - delayed_records.push_back(r); -} - -void VkCompute::record_transfer_transfer_barrier(const VkMat& m) -{ - m.data->state = 2; - - if (vkdev->info.support_VK_KHR_push_descriptor) - return transfer_transfer_barrier(m.buffer(), m.buffer_offset(), m.total() * m.elemsize); - - record_type r; - r.type = 9; - r.transfer_transfer_barrier.buffer = m.buffer(); - r.transfer_transfer_barrier.offset = m.buffer_offset(); - r.transfer_transfer_barrier.size = m.total() * m.elemsize; - delayed_records.push_back(r); -} - -void VkCompute::record_host_transfer_barrier(const VkMat& m) -{ - m.data->state = 2; - - if (!m.allocator->mappable && !m.staging_data) - return; - - VkBuffer buffer = m.allocator->mappable ? m.buffer() : m.staging_buffer(); - size_t buffer_offset = m.allocator->mappable ? m.buffer_offset() : m.staging_buffer_offset(); - - if (vkdev->info.support_VK_KHR_push_descriptor) - return host_transfer_barrier(buffer, buffer_offset, m.total() * m.elemsize); - - record_type r; - r.type = 12; - r.host_transfer_barrier.buffer = buffer; - r.host_transfer_barrier.offset = buffer_offset; - r.host_transfer_barrier.size = m.total() * m.elemsize; - delayed_records.push_back(r); -} - -void VkCompute::record_transfer_host_barrier(const VkMat& m) -{ - m.data->state = 1; - - if (!m.allocator->mappable && !m.staging_data) - return; + // stash download post buffer and mat + download_post_buffers.push_back(src_staging); + download_post_mats.push_back(dst); - VkBuffer buffer = m.allocator->mappable ? m.buffer() : m.staging_buffer(); - size_t buffer_offset = m.allocator->mappable ? m.buffer_offset() : m.staging_buffer_offset(); - - if (vkdev->info.support_VK_KHR_push_descriptor) - return transfer_host_barrier(buffer, buffer_offset, m.total() * m.elemsize); - - record_type r; - r.type = 13; - r.transfer_host_barrier.buffer = buffer; - r.transfer_host_barrier.offset = buffer_offset; - r.transfer_host_barrier.size = m.total() * m.elemsize; - delayed_records.push_back(r); + // post memcpy device to dst + { + record r; + r.type = record::TYPE_post_download; + r.command_buffer = 0; + r.post_download.download_post_buffer_mat_offset = download_post_buffers.size() - 1; + delayed_records.push_back(r); + } } -void VkCompute::record_host_compute_barrier(const VkMat& m) +void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt) { - m.data->state = 3; - - if (!m.allocator->mappable && !m.staging_data) - return; +// fprintf(stderr, "record_clone\n"); - VkBuffer buffer = m.allocator->mappable ? m.buffer() : m.staging_buffer(); - size_t buffer_offset = m.allocator->mappable ? m.buffer_offset() : m.staging_buffer_offset(); + if (src.data->access_flags != VK_ACCESS_TRANSFER_READ_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT) + { + // barrier device any @ compute to transfer-read @ compute + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = src.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = src.buffer(); + barriers[0].offset = src.buffer_offset(); + barriers[0].size = src.buffer_capacity(); + + VkPipelineStageFlags src_stage = src.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } - if (vkdev->info.support_VK_KHR_push_descriptor) - return host_compute_barrier(buffer, buffer_offset, m.total() * m.elemsize); - - record_type r; - r.type = 14; - r.host_compute_barrier.buffer = buffer; - r.host_compute_barrier.offset = buffer_offset; - r.host_compute_barrier.size = m.total() * m.elemsize; - delayed_records.push_back(r); -} + // mark device transfer-read @ transfer + src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT; + src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; + } -void VkCompute::record_compute_host_barrier(const VkMat& m) -{ - m.data->state = 1; + // create dst + dst.create_like(src, opt.blob_vkallocator); - if (!m.allocator->mappable && !m.staging_data) - return; + // record device to staging + { + VkBufferCopy* regions = new VkBufferCopy[1]; + regions[0].srcOffset = src.buffer_offset(); + regions[0].dstOffset = dst.buffer_offset(); + regions[0].size = std::min(src.buffer_capacity(), dst.buffer_capacity()); - VkBuffer buffer = m.allocator->mappable ? m.buffer() : m.staging_buffer(); - size_t buffer_offset = m.allocator->mappable ? m.buffer_offset() : m.staging_buffer_offset(); + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdCopyBuffer(compute_command_buffer, src.buffer(), dst.buffer(), 1, regions); + delete[] regions; + } + else + { + record r; + r.type = record::TYPE_copy_buffer; + r.command_buffer = compute_command_buffer; + r.copy_buffer.src = src.buffer(); + r.copy_buffer.dst = dst.buffer(); + r.copy_buffer.region_count = 1; + r.copy_buffer.regions = regions; + delayed_records.push_back(r); + } + } - if (vkdev->info.support_VK_KHR_push_descriptor) - return compute_host_barrier(buffer, buffer_offset, m.total() * m.elemsize); - - record_type r; - r.type = 15; - r.compute_host_barrier.buffer = buffer; - r.compute_host_barrier.offset = buffer_offset; - r.compute_host_barrier.size = m.total() * m.elemsize; - delayed_records.push_back(r); + // mark device transfer-write @ transfer + dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT; } -void VkCompute::record_prepare_transfer_barrier(const VkMat& m) +void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkMat& dispatcher) { - if (m.data->state == 1) - return record_host_transfer_barrier(m); - - if (m.data->state == 2) - return record_transfer_transfer_barrier(m); +// fprintf(stderr, "record_pipeline %p\n", pipeline); - if (m.data->state == 3) - return record_compute_transfer_barrier(m); + const size_t binding_count = bindings.size(); + const size_t constant_count = constants.size(); - m.data->state = 2; -} + for (size_t i=0; istate == 1) - return record_host_compute_barrier(m); + if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) + { + // barrier device any @ compute/null to shader-readwrite @ compute + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = binding.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = binding.buffer(); + barriers[0].offset = binding.buffer_offset(); + barriers[0].size = binding.buffer_capacity(); + + VkPipelineStageFlags src_stage = binding.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device shader-readwrite @ compute + binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + } - if (m.data->state == 2) - return record_transfer_compute_barrier(m); + // record bind pipeline + { + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdBindPipeline(compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); + } + else + { + record r; + r.type = record::TYPE_bind_pipeline; + r.command_buffer = compute_command_buffer; + r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; + r.bind_pipeline.pipeline = pipeline->pipeline; + delayed_records.push_back(r); + } + } - if (m.data->state == 3) - return record_compute_compute_barrier(m); + // record update bindings + if (binding_count > 0) + { + std::vector descriptorBufferInfos(binding_count); + for (size_t i=0; istate = 3; -} + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkdev->vkCmdPushDescriptorSetWithTemplateKHR(compute_command_buffer, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, descriptorBufferInfos.data()); + } + else + { + // create new descriptor_pool and descriptorset + VkDescriptorPool descriptor_pool; + { + VkDescriptorPoolSize poolSize; + poolSize.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + poolSize.descriptorCount = binding_count; + + VkDescriptorPoolCreateInfo descriptorPoolCreateInfo; + descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + descriptorPoolCreateInfo.pNext = 0; + descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + descriptorPoolCreateInfo.maxSets = 1; + descriptorPoolCreateInfo.poolSizeCount = 1; + descriptorPoolCreateInfo.pPoolSizes = &poolSize; + + VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkCreateDescriptorPool failed %d\n", ret); + return; + } + } + descriptor_pools.push_back(descriptor_pool); + + VkDescriptorSet descriptorset; + { + VkDescriptorSetAllocateInfo descriptorSetAllocateInfo; + descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + descriptorSetAllocateInfo.pNext = 0; + descriptorSetAllocateInfo.descriptorPool = descriptor_pool; + descriptorSetAllocateInfo.descriptorSetCount = 1; + descriptorSetAllocateInfo.pSetLayouts = &pipeline->descriptorset_layout; + + VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkAllocateDescriptorSets failed %d\n", ret); + return; + } + } + descriptorsets.push_back(descriptorset); + + if (vkdev->info.support_VK_KHR_descriptor_update_template) + { + vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template, descriptorBufferInfos.data()); + } + else + { + std::vector writeDescriptorSets(binding_count); + for (size_t i=0; ivkdevice(), binding_count, writeDescriptorSets.data(), 0, 0); + } + + record r; + r.type = record::TYPE_bind_descriptorsets; + r.command_buffer = compute_command_buffer; + r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; + r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout; + r.bind_descriptorsets.descriptorset_count = 1; + r.bind_descriptorsets.descriptorset_offset = descriptorsets.size() - 1; + delayed_records.push_back(r); + } + } -void VkCompute::record_prepare_host_barrier(const VkMat& m) -{ - if (m.data->state == 2) - return record_transfer_host_barrier(m); + // record push constants + if (constant_count > 0) + { + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPushConstants(compute_command_buffer, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, constant_count * sizeof(vk_constant_type), constants.data()); + } + else + { + uint32_t size = constant_count * sizeof(vk_constant_type); + unsigned char* constant_values = new unsigned char[size]; + memcpy(constant_values, constants.data(), size); + + record r; + r.type = record::TYPE_push_constants; + r.command_buffer = compute_command_buffer; + r.push_constants.pipeline_layout = pipeline->pipeline_layout; + r.push_constants.stage_flags = VK_SHADER_STAGE_COMPUTE_BIT; + r.push_constants.size = size; + r.push_constants.values = constant_values; + delayed_records.push_back(r); + } + } - if (m.data->state == 3) - return record_compute_host_barrier(m); + // record dispatch + { + uint32_t group_count_x = (dispatcher.w + pipeline->local_size_x - 1) / pipeline->local_size_x; + uint32_t group_count_y = (dispatcher.h + pipeline->local_size_y - 1) / pipeline->local_size_y; + uint32_t group_count_z = (dispatcher.c + pipeline->local_size_z - 1) / pipeline->local_size_z; - m.data->state = 1; + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdDispatch(compute_command_buffer, group_count_x, group_count_y, group_count_z); + } + else + { + record r; + r.type = record::TYPE_dispatch; + r.command_buffer = compute_command_buffer; + r.dispatch.group_count_x = group_count_x; + r.dispatch.group_count_y = group_count_y; + r.dispatch.group_count_z = group_count_z; + delayed_records.push_back(r); + } + } } -void VkCompute::record_initial_image_compute_barrier(const VkImageMat& im) +#if NCNN_BENCHMARK +void VkCompute::record_write_timestamp(uint32_t query) { if (vkdev->info.support_VK_KHR_push_descriptor) - return initial_image_compute_barrier(im.image()); - - record_type r; - r.type = 11; - r.initial_image_compute_barrier.image = im.image(); - delayed_records.push_back(r); + { + if (query_pool) + vkCmdWriteTimestamp(compute_command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, query_pool, query); + } + else + { + record r; + r.type = record::TYPE_write_timestamp; + r.command_buffer = compute_command_buffer; + r.write_timestamp.query = query; + delayed_records.push_back(r); + } } +#endif // NCNN_BENCHMARK #if __ANDROID_API__ >= 26 -void VkCompute::record_update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorSetLayout descriptorset_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, VkSampler sampler, const VkImageMat& im, const VkMat& m) +void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst) { - VkDescriptorImageInfo descriptorImageInfo; - descriptorImageInfo.sampler = sampler; - descriptorImageInfo.imageView = im.imageview(); - descriptorImageInfo.imageLayout = VK_IMAGE_LAYOUT_GENERAL; - - VkDescriptorBufferInfo descriptorBufferInfo; - descriptorBufferInfo.buffer = m.buffer(); - descriptorBufferInfo.offset = m.buffer_offset(); - descriptorBufferInfo.range = m.total() * m.elemsize; - - if (vkdev->info.support_VK_KHR_push_descriptor) + // image layout transform undefined @ null to general @ compute { - return update_import_android_hardware_buffer_bindings(pipeline_layout, descriptor_update_template, descriptorImageInfo, descriptorBufferInfo); + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = 0; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + barriers[0].newLayout = VK_IMAGE_LAYOUT_GENERAL; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = src.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } } - // create new descriptor_pool and descriptorset - VkDescriptorPool descriptor_pool; + // record bind pipeline { - VkDescriptorPoolSize poolSizes[2]; - poolSizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - poolSizes[0].descriptorCount = 1; - poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - poolSizes[1].descriptorCount = 1; - - VkDescriptorPoolCreateInfo descriptorPoolCreateInfo; - descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - descriptorPoolCreateInfo.pNext = 0; - descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; - descriptorPoolCreateInfo.maxSets = 1; - descriptorPoolCreateInfo.poolSizeCount = 2; - descriptorPoolCreateInfo.pPoolSizes = poolSizes; - - VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool); - if (ret != VK_SUCCESS) + if (vkdev->info.support_VK_KHR_push_descriptor) { - fprintf(stderr, "vkCreateDescriptorPool failed %d\n", ret); - return; + vkCmdBindPipeline(compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); + } + else + { + record r; + r.type = record::TYPE_bind_pipeline; + r.command_buffer = compute_command_buffer; + r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; + r.bind_pipeline.pipeline = pipeline->pipeline; + delayed_records.push_back(r); } } - descriptor_pools.push_back(descriptor_pool); - VkDescriptorSet descriptorset; + // record update bindings { - VkDescriptorSetAllocateInfo descriptorSetAllocateInfo; - descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; - descriptorSetAllocateInfo.pNext = 0; - descriptorSetAllocateInfo.descriptorPool = descriptor_pool; - descriptorSetAllocateInfo.descriptorSetCount = 1; - descriptorSetAllocateInfo.pSetLayouts = &descriptorset_layout; + VkDescriptorImageInfo descriptorImageInfo; + descriptorImageInfo.sampler = pipeline->sampler; + descriptorImageInfo.imageView = src.imageview(); + descriptorImageInfo.imageLayout = VK_IMAGE_LAYOUT_GENERAL; - VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset); - if (ret != VK_SUCCESS) + VkDescriptorBufferInfo descriptorBufferInfo; + descriptorBufferInfo.buffer = dst.buffer(); + descriptorBufferInfo.offset = dst.buffer_offset(); + descriptorBufferInfo.range = dst.total() * dst.elemsize; + + if (vkdev->info.support_VK_KHR_push_descriptor) { - fprintf(stderr, "vkAllocateDescriptorSets failed %d\n", ret); - return; + struct ImportAndroidHardwareBufferDescriptorInfo + { + VkDescriptorImageInfo imageInfo; + VkDescriptorBufferInfo bufferInfo; + VkDescriptorBufferInfo buffer4Info; + }; + + ImportAndroidHardwareBufferDescriptorInfo info; + info.imageInfo = descriptorImageInfo; + info.bufferInfo = descriptorBufferInfo; + info.buffer4Info = descriptorBufferInfo; + + vkdev->vkCmdPushDescriptorSetWithTemplateKHR(compute_command_buffer, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, &info); + } + else + { + // create new descriptor_pool and descriptorset + VkDescriptorPool descriptor_pool; + { + VkDescriptorPoolSize poolSizes[2]; + poolSizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + poolSizes[0].descriptorCount = 1; + poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + poolSizes[1].descriptorCount = 1; + + VkDescriptorPoolCreateInfo descriptorPoolCreateInfo; + descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + descriptorPoolCreateInfo.pNext = 0; + descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + descriptorPoolCreateInfo.maxSets = 1; + descriptorPoolCreateInfo.poolSizeCount = 2; + descriptorPoolCreateInfo.pPoolSizes = poolSizes; + + VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkCreateDescriptorPool failed %d\n", ret); + return; + } + } + descriptor_pools.push_back(descriptor_pool); + + VkDescriptorSet descriptorset; + { + VkDescriptorSetAllocateInfo descriptorSetAllocateInfo; + descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + descriptorSetAllocateInfo.pNext = 0; + descriptorSetAllocateInfo.descriptorPool = descriptor_pool; + descriptorSetAllocateInfo.descriptorSetCount = 1; + descriptorSetAllocateInfo.pSetLayouts = &pipeline->descriptorset_layout; + + VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkAllocateDescriptorSets failed %d\n", ret); + return; + } + } + descriptorsets.push_back(descriptorset); + + if (vkdev->info.support_VK_KHR_descriptor_update_template) + { + struct ImportAndroidHardwareBufferDescriptorInfo + { + VkDescriptorImageInfo imageInfo; + VkDescriptorBufferInfo bufferInfo; + VkDescriptorBufferInfo buffer4Info; + }; + + ImportAndroidHardwareBufferDescriptorInfo info; + info.imageInfo = descriptorImageInfo; + info.bufferInfo = descriptorBufferInfo; + info.buffer4Info = descriptorBufferInfo; + + vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template, &info); + } + else + { + VkWriteDescriptorSet writeDescriptorSets[3]; + writeDescriptorSets[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writeDescriptorSets[0].pNext = 0; + writeDescriptorSets[0].dstSet = descriptorset; + writeDescriptorSets[0].dstBinding = 0; + writeDescriptorSets[0].dstArrayElement = 0; + writeDescriptorSets[0].descriptorCount = 1; + writeDescriptorSets[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + writeDescriptorSets[0].pImageInfo = &descriptorImageInfo; + writeDescriptorSets[0].pBufferInfo = 0; + writeDescriptorSets[0].pTexelBufferView = 0; + writeDescriptorSets[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writeDescriptorSets[1].pNext = 0; + writeDescriptorSets[1].dstSet = descriptorset; + writeDescriptorSets[1].dstBinding = 1; + writeDescriptorSets[1].dstArrayElement = 0; + writeDescriptorSets[1].descriptorCount = 1; + writeDescriptorSets[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + writeDescriptorSets[1].pImageInfo = 0; + writeDescriptorSets[1].pBufferInfo = &descriptorBufferInfo; + writeDescriptorSets[1].pTexelBufferView = 0; + writeDescriptorSets[2].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writeDescriptorSets[2].pNext = 0; + writeDescriptorSets[2].dstSet = descriptorset; + writeDescriptorSets[2].dstBinding = 2; + writeDescriptorSets[2].dstArrayElement = 0; + writeDescriptorSets[2].descriptorCount = 1; + writeDescriptorSets[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + writeDescriptorSets[2].pImageInfo = 0; + writeDescriptorSets[2].pBufferInfo = &descriptorBufferInfo; + writeDescriptorSets[2].pTexelBufferView = 0; + + vkUpdateDescriptorSets(vkdev->vkdevice(), 3, writeDescriptorSets, 0, 0); + } + + record r; + r.type = record::TYPE_bind_descriptorsets; + r.command_buffer = compute_command_buffer; + r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; + r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout; + r.bind_descriptorsets.descriptorset_count = 1; + r.bind_descriptorsets.descriptorset_offset = descriptorsets.size() - 1; + delayed_records.push_back(r); } } - descriptorsets.push_back(descriptorset); - -// fprintf(stderr, "update descriptorset %p\n", descriptorset); - if (vkdev->info.support_VK_KHR_descriptor_update_template) + // record dispatch { - struct ImportAndroidHardwareBufferDescriptorInfo - { - VkDescriptorImageInfo imageInfo; - VkDescriptorBufferInfo bufferInfo; - VkDescriptorBufferInfo buffer4Info; - }; + uint32_t group_count_x = (dst.w + pipeline->local_size_x - 1) / pipeline->local_size_x; + uint32_t group_count_y = (dst.h + pipeline->local_size_y - 1) / pipeline->local_size_y; + uint32_t group_count_z = (dst.c + pipeline->local_size_z - 1) / pipeline->local_size_z; - ImportAndroidHardwareBufferDescriptorInfo info; - info.imageInfo = descriptorImageInfo; - info.bufferInfo = descriptorBufferInfo; - info.buffer4Info = descriptorBufferInfo; - - vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, descriptor_update_template, &info); + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdDispatch(compute_command_buffer, group_count_x, group_count_y, group_count_z); + } + else + { + record r; + r.type = record::TYPE_dispatch; + r.command_buffer = compute_command_buffer; + r.dispatch.group_count_x = group_count_x; + r.dispatch.group_count_y = group_count_y; + r.dispatch.group_count_z = group_count_z; + delayed_records.push_back(r); + } } - else - { - VkWriteDescriptorSet writeDescriptorSets[3]; - writeDescriptorSets[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[0].pNext = 0; - writeDescriptorSets[0].dstSet = descriptorset; - writeDescriptorSets[0].dstBinding = 0; - writeDescriptorSets[0].dstArrayElement = 0; - writeDescriptorSets[0].descriptorCount = 1; - writeDescriptorSets[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; - writeDescriptorSets[0].pImageInfo = &descriptorImageInfo; - writeDescriptorSets[0].pBufferInfo = 0; - writeDescriptorSets[0].pTexelBufferView = 0; - writeDescriptorSets[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[1].pNext = 0; - writeDescriptorSets[1].dstSet = descriptorset; - writeDescriptorSets[1].dstBinding = 1; - writeDescriptorSets[1].dstArrayElement = 0; - writeDescriptorSets[1].descriptorCount = 1; - writeDescriptorSets[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - writeDescriptorSets[1].pImageInfo = 0; - writeDescriptorSets[1].pBufferInfo = &descriptorBufferInfo; - writeDescriptorSets[1].pTexelBufferView = 0; - writeDescriptorSets[2].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[2].pNext = 0; - writeDescriptorSets[2].dstSet = descriptorset; - writeDescriptorSets[2].dstBinding = 2; - writeDescriptorSets[2].dstArrayElement = 0; - writeDescriptorSets[2].descriptorCount = 1; - writeDescriptorSets[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - writeDescriptorSets[2].pImageInfo = 0; - writeDescriptorSets[2].pBufferInfo = &descriptorBufferInfo; - writeDescriptorSets[2].pTexelBufferView = 0; - - vkUpdateDescriptorSets(vkdev->vkdevice(), 3, writeDescriptorSets, 0, 0); - } - - record_type r; - r.type = 3; - r.bind_descriptorset.pipeline_layout = pipeline_layout; - r.bind_descriptorset.descriptorset = descriptorset; - delayed_records.push_back(r); } #endif // __ANDROID_API__ >= 26 int VkCompute::submit_and_wait() { - if (vkdev->info.support_VK_KHR_push_descriptor) +// fprintf(stderr, "submit_and_wait\n"); + + if (!vkdev->info.support_VK_KHR_push_descriptor) { - end_command_buffer(); + begin_command_buffer(); - return queue_submit_and_wait_fence(); - } +#if NCNN_BENCHMARK + if (query_pool) + vkCmdResetQueryPool(compute_command_buffer, query_pool, 0, query_count); +#endif // NCNN_BENCHMARK - begin_command_buffer(); + const size_t record_count = delayed_records.size(); + // handle delayed records + for (size_t i=0; iacquire_queue(vkdev->info.compute_queue_family_index); + if (compute_queue == 0) + { + fprintf(stderr, "out of compute queue\n"); + return -1; + } + + // submit compute + { + VkSubmitInfo submitInfo; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.pNext = 0; + submitInfo.waitSemaphoreCount = 0; + submitInfo.pWaitSemaphores = 0; + submitInfo.pWaitDstStageMask = 0; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &compute_command_buffer; + submitInfo.signalSemaphoreCount = 0; + submitInfo.pSignalSemaphores = 0; + + VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, compute_command_fence); + if (ret != VK_SUCCESS) { - case 0: - copy_buffer(r.copy.src, r.copy.src_offset, r.copy.dst, r.copy.dst_offset, r.copy.size); - break; - case 1: - copy_buffer_regions(r.copy_regions.src, r.copy_regions.dst, r.regions); - break; - case 2: - bind_pipeline(r.bind_pipeline.pipeline); - break; - case 3: - bind_descriptorset(r.bind_descriptorset.pipeline_layout, r.bind_descriptorset.descriptorset); - break; - case 4: - push_constants(r.push_constants.pipeline_layout, r.constants); - break; - case 5: - dispatch(r.dispatch.group_count_xyz); - break; - case 6: - transfer_compute_barrier(r.transfer_compute_barrier.buffer, r.transfer_compute_barrier.offset, r.transfer_compute_barrier.size); - break; - case 7: - compute_transfer_barrier(r.compute_transfer_barrier.buffer, r.compute_transfer_barrier.offset, r.compute_transfer_barrier.size); - break; - case 8: - compute_compute_barrier(r.compute_compute_barrier.buffer, r.compute_compute_barrier.offset, r.compute_compute_barrier.size); - break; - case 9: - transfer_transfer_barrier(r.compute_compute_barrier.buffer, r.compute_compute_barrier.offset, r.compute_compute_barrier.size); - break; -#if NCNN_BENCHMARK - case 10: - write_timestamp(r.write_timestamp.query); - break; -#endif // NCNN_BENCHMARK - case 11: - initial_image_compute_barrier(r.initial_image_compute_barrier.image); - break; - case 12: - host_transfer_barrier(r.host_transfer_barrier.buffer, r.host_transfer_barrier.offset, r.host_transfer_barrier.size); - break; - case 13: - transfer_host_barrier(r.transfer_host_barrier.buffer, r.transfer_host_barrier.offset, r.transfer_host_barrier.size); - break; - case 14: - host_compute_barrier(r.host_compute_barrier.buffer, r.host_compute_barrier.offset, r.host_compute_barrier.size); - break; - case 15: - compute_host_barrier(r.compute_host_barrier.buffer, r.compute_host_barrier.offset, r.compute_host_barrier.size); + fprintf(stderr, "vkQueueSubmit failed %d\n", ret); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + } + + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + + // wait + { + VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &compute_command_fence, VK_TRUE, UINT64_MAX); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkWaitForFences failed %d\n", ret); + return -1; + } + } + + // handle delayed post records + for (size_t i=0; iinvalidate(src.data); + memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize); break; - case 16: - queue_transfer_acquire_barrier(r.queue_transfer_acquire_barrier.buffer, r.queue_transfer_acquire_barrier.offset, r.queue_transfer_acquire_barrier.size, r.queue_transfer_acquire_barrier.src_queue_family_index); + } + case record::TYPE_copy_buffer: + case record::TYPE_bind_pipeline: + case record::TYPE_bind_descriptorsets: + case record::TYPE_push_constants: + case record::TYPE_dispatch: + case record::TYPE_memory_barrers: + case record::TYPE_buffer_barrers: + case record::TYPE_image_barrers: + default: break; + } } - end_command_buffer(); - delayed_records.clear(); - return queue_submit_and_wait_fence(); + return 0; } int VkCompute::reset() { -// fprintf(stderr, "cmd reset\n"); - - VkResult ret = vkResetCommandBuffer(command_buffer, 0); - if (ret != VK_SUCCESS) + // reset command buffer and fence { - fprintf(stderr, "vkResetCommandBuffer failed %d\n", ret); - return -1; + VkResult ret = vkResetCommandBuffer(compute_command_buffer, 0); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkResetCommandBuffer failed %d\n", ret); + return -1; + } } - - ret = vkResetFences(vkdev->vkdevice(), 1, &fence); - if (ret != VK_SUCCESS) { - fprintf(stderr, "vkResetFences failed %d\n", ret); - return -1; + VkResult ret = vkResetFences(vkdev->vkdevice(), 1, &compute_command_fence); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkResetFences failed %d\n", ret); + return -1; + } } if (vkdev->info.support_VK_KHR_push_descriptor) @@ -923,7 +1053,8 @@ int VkCompute::reset() begin_command_buffer(); #if NCNN_BENCHMARK - reset_query_pool(); + if (query_pool) + vkCmdResetQueryPool(compute_command_buffer, query_pool, 0, query_count); #endif // NCNN_BENCHMARK } @@ -952,7 +1083,8 @@ int VkCompute::create_query_pool(uint32_t _query_count) if (vkdev->info.support_VK_KHR_push_descriptor) { - reset_query_pool(); + if (query_pool) + vkCmdResetQueryPool(compute_command_buffer, query_pool, 0, query_count); } return 0; @@ -978,324 +1110,138 @@ int VkCompute::get_query_pool_results(uint32_t first_query, uint32_t query_count } #endif // NCNN_BENCHMARK -void VkCompute::copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size) -{ -// fprintf(stderr, "cmd copy %p[+%lu] to %p[+%lu] %lu\n", src, src_offset, dst, dst_offset, size); - - VkBufferCopy region; - region.srcOffset = src_offset; - region.dstOffset = dst_offset; - region.size = size; - - vkCmdCopyBuffer(command_buffer, src, dst, 1, ®ion); -} - -void VkCompute::copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector& regions) +int VkCompute::init() { -// fprintf(stderr, "cmd copy regions %p to %p\n", src, dst); - - vkCmdCopyBuffer(command_buffer, src, dst, regions.size(), regions.data()); -} - -void VkCompute::bind_pipeline(VkPipeline pipeline) -{ -// fprintf(stderr, "cmd bind_pipeline %p\n", pipeline); + // compute_command_pool + { + VkCommandPoolCreateInfo commandPoolCreateInfo; + commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + commandPoolCreateInfo.pNext = 0; + commandPoolCreateInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + commandPoolCreateInfo.queueFamilyIndex = vkdev->info.compute_queue_family_index; - vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); -} + VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &compute_command_pool); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkCreateCommandPool failed %d\n", ret); + return -1; + } + } -void VkCompute::bind_descriptorset(VkPipelineLayout pipeline_layout, VkDescriptorSet descriptorset) -{ -// fprintf(stderr, "cmd bind_descriptorset %p %p\n", pipeline_layout, descriptorset); + // compute_command_buffer + { + VkCommandBufferAllocateInfo commandBufferAllocateInfo; + commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + commandBufferAllocateInfo.pNext = 0; + commandBufferAllocateInfo.commandPool = compute_command_pool; + commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + commandBufferAllocateInfo.commandBufferCount = 1; + + VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &compute_command_buffer); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkAllocateCommandBuffers failed %d\n", ret); + return -1; + } + } - vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_layout, 0, 1, &descriptorset, 0, 0); -} + // compute_command_fence + { + VkFenceCreateInfo fenceCreateInfo; + fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceCreateInfo.pNext = 0; + fenceCreateInfo.flags = 0; -void VkCompute::update_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const std::vector& descriptorBufferInfos) -{ -// fprintf(stderr, "cmd update_bindings %p %p\n", pipeline_layout, descriptor_update_template); + VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &compute_command_fence); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkCreateFence failed %d\n", ret); + return -1; + } + } - vkdev->vkCmdPushDescriptorSetWithTemplateKHR(command_buffer, descriptor_update_template, pipeline_layout, 0, descriptorBufferInfos.data()); -} + if (vkdev->info.support_VK_KHR_push_descriptor) + { + begin_command_buffer(); -void VkCompute::push_constants(VkPipelineLayout pipeline_layout, const std::vector& constants) -{ -// fprintf(stderr, "cmd push_constants %p\n", pipeline_layout); +#if NCNN_BENCHMARK + if (query_pool) + vkCmdResetQueryPool(compute_command_buffer, query_pool, 0, query_count); +#endif // NCNN_BENCHMARK + } - vkCmdPushConstants(command_buffer, pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, constants.size() * sizeof(vk_constant_type), constants.data()); + return 0; } -void VkCompute::dispatch(const uint32_t* group_count_xyz) +int VkCompute::begin_command_buffer() { -// fprintf(stderr, "cmd dispatch %d %d %d\n", group_count_xyz[0], group_count_xyz[1], group_count_xyz[2]); + VkCommandBufferBeginInfo commandBufferBeginInfo; + commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + commandBufferBeginInfo.pNext = 0; + commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + commandBufferBeginInfo.pInheritanceInfo = 0; - vkCmdDispatch(command_buffer, group_count_xyz[0], group_count_xyz[1], group_count_xyz[2]); -} + VkResult ret = vkBeginCommandBuffer(compute_command_buffer, &commandBufferBeginInfo); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkBeginCommandBuffer failed %d\n", ret); + return -1; + } -void VkCompute::transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t size) -{ -// fprintf(stderr, "cmd transfer_compute_barrier %p[+%lu] %lu\n", buffer, offset, size); - - VkBufferMemoryBarrier bufferBarrier; - bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - bufferBarrier.pNext = 0; - bufferBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - bufferBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; - bufferBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.buffer = buffer; - bufferBarrier.offset = offset; - bufferBarrier.size = size; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - - vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0); + return 0; } -void VkCompute::compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t size) +int VkCompute::end_command_buffer() { -// fprintf(stderr, "cmd compute_transfer_barrier %p[+%lu] %lu\n", buffer, offset, size); - - VkBufferMemoryBarrier bufferBarrier; - bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - bufferBarrier.pNext = 0; - bufferBarrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; - bufferBarrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - bufferBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.buffer = buffer; - bufferBarrier.offset = offset; - bufferBarrier.size = size; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; - - vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0); -} + VkResult ret = vkEndCommandBuffer(compute_command_buffer); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkEndCommandBuffer failed %d\n", ret); + return -1; + } -void VkCompute::compute_compute_barrier(VkBuffer buffer, size_t offset, size_t size) -{ -// fprintf(stderr, "cmd compute_compute_barrier %p[+%lu] %lu\n", buffer, offset, size); - - VkBufferMemoryBarrier bufferBarrier; - bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - bufferBarrier.pNext = 0; - bufferBarrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - bufferBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - bufferBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.buffer = buffer; - bufferBarrier.offset = offset; - bufferBarrier.size = size; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - - vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0); + return 0; } -void VkCompute::transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size) +VkTransfer::VkTransfer(const VulkanDevice* _vkdev) : vkdev(_vkdev) { -// fprintf(stderr, "cmd transfer_transfer_barrier %p[+%lu] %lu\n", buffer, offset, size); - - VkBufferMemoryBarrier bufferBarrier; - bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - bufferBarrier.pNext = 0; - bufferBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - bufferBarrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - bufferBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.buffer = buffer; - bufferBarrier.offset = offset; - bufferBarrier.size = size; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; - - vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0); -} + compute_command_pool = 0; + transfer_command_pool = 0; -void VkCompute::host_transfer_barrier(VkBuffer buffer, size_t offset, size_t size) -{ -// fprintf(stderr, "cmd host_transfer_barrier %p[+%lu] %lu\n", buffer, offset, size); - - VkBufferMemoryBarrier bufferBarrier; - bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - bufferBarrier.pNext = 0; - bufferBarrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; - bufferBarrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - bufferBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.buffer = buffer; - bufferBarrier.offset = offset; - bufferBarrier.size = size; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_HOST_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; - - vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0); -} + upload_command_buffer = 0; + compute_command_buffer = 0; -void VkCompute::transfer_host_barrier(VkBuffer buffer, size_t offset, size_t size) -{ -// fprintf(stderr, "cmd transfer_host_barrier %p[+%lu] %lu\n", buffer, offset, size); - - VkBufferMemoryBarrier bufferBarrier; - bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - bufferBarrier.pNext = 0; - bufferBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - bufferBarrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT; - bufferBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.buffer = buffer; - bufferBarrier.offset = offset; - bufferBarrier.size = size; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_HOST_BIT; - - vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0); -} + upload_compute_semaphore = 0; -void VkCompute::host_compute_barrier(VkBuffer buffer, size_t offset, size_t size) -{ -// fprintf(stderr, "cmd host_compute_barrier %p[+%lu] %lu\n", buffer, offset, size); - - VkBufferMemoryBarrier bufferBarrier; - bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - bufferBarrier.pNext = 0; - bufferBarrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; - bufferBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; - bufferBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.buffer = buffer; - bufferBarrier.offset = offset; - bufferBarrier.size = size; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_HOST_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - - vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0); -} + upload_command_fence = 0; + compute_command_fence = 0; -void VkCompute::compute_host_barrier(VkBuffer buffer, size_t offset, size_t size) -{ -// fprintf(stderr, "cmd compute_host_barrier %p[+%lu] %lu\n", buffer, offset, size); - - VkBufferMemoryBarrier bufferBarrier; - bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - bufferBarrier.pNext = 0; - bufferBarrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; - bufferBarrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT; - bufferBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferBarrier.buffer = buffer; - bufferBarrier.offset = offset; - bufferBarrier.size = size; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_HOST_BIT; - - vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0); + init(); } -void VkCompute::queue_transfer_acquire_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t src_queue_family_index) +VkTransfer::~VkTransfer() { -// fprintf(stderr, "cmd queue_transfer_acquire_barrier %p[+%lu] %lu %lu -> %lu\n", buffer, offset, size, src_queue_family_index, queue_family_index); - - VkBufferMemoryBarrier bufferBarrier; - bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - bufferBarrier.pNext = 0; - bufferBarrier.srcAccessMask = 0; - bufferBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - bufferBarrier.srcQueueFamilyIndex = src_queue_family_index; - bufferBarrier.dstQueueFamilyIndex = queue_family_index; - bufferBarrier.buffer = buffer; - bufferBarrier.offset = offset; - bufferBarrier.size = size; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - - vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0); -} + vkDestroyFence(vkdev->vkdevice(), compute_command_fence, 0); -void VkCompute::initial_image_compute_barrier(VkImage image) -{ -// fprintf(stderr, "cmd initial_image_compute_barrier %p %lu %lu\n", image, oldlayout, newlayout); - - VkImageMemoryBarrier imageBarrier; - imageBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - imageBarrier.pNext = 0; - imageBarrier.srcAccessMask = 0; - imageBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - imageBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; - imageBarrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; - imageBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - imageBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - imageBarrier.image = image; - imageBarrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - imageBarrier.subresourceRange.baseMipLevel = 0; - imageBarrier.subresourceRange.levelCount = 1; - imageBarrier.subresourceRange.baseArrayLayer = 0; - imageBarrier.subresourceRange.layerCount = 1; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - - vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 0, 0, 1, &imageBarrier); -} + vkFreeCommandBuffers(vkdev->vkdevice(), compute_command_pool, 1, &compute_command_buffer); + vkDestroyCommandPool(vkdev->vkdevice(), compute_command_pool, 0); -#if __ANDROID_API__ >= 26 -void VkCompute::update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const VkDescriptorImageInfo& descriptorImageInfo, const VkDescriptorBufferInfo& descriptorBufferInfo) -{ - struct ImportAndroidHardwareBufferDescriptorInfo + if (!vkdev->info.unified_compute_transfer_queue) { - VkDescriptorImageInfo imageInfo; - VkDescriptorBufferInfo bufferInfo; - VkDescriptorBufferInfo buffer4Info; - }; - - ImportAndroidHardwareBufferDescriptorInfo info; - info.imageInfo = descriptorImageInfo; - info.bufferInfo = descriptorBufferInfo; - info.buffer4Info = descriptorBufferInfo; - - vkdev->vkCmdPushDescriptorSetWithTemplateKHR(command_buffer, descriptor_update_template, pipeline_layout, 0, &info); -} -#endif // __ANDROID_API__ >= 26 - -#if NCNN_BENCHMARK -void VkCompute::reset_query_pool() -{ -// fprintf(stderr, "cmd reset_query_pool\n"); - - if (query_pool) - vkCmdResetQueryPool(command_buffer, query_pool, 0, query_count); -} + vkDestroyFence(vkdev->vkdevice(), upload_command_fence, 0); -void VkCompute::write_timestamp(uint32_t query) -{ -// fprintf(stderr, "cmd write_timestamp %u\n", query); - - if (query_pool) - vkCmdWriteTimestamp(command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, query_pool, query); -} -#endif // NCNN_BENCHMARK - -VkTransfer::VkTransfer(const VulkanDevice* _vkdev) : Command(_vkdev, _vkdev->info.transfer_queue_family_index) -{ - buffer_offset_alignment = vkdev->info.buffer_offset_alignment; - staging_data = 0; -} + vkDestroySemaphore(vkdev->vkdevice(), upload_compute_semaphore, 0); -VkTransfer::~VkTransfer() -{ + vkFreeCommandBuffers(vkdev->vkdevice(), transfer_command_pool, 1, &upload_command_buffer); + vkDestroyCommandPool(vkdev->vkdevice(), transfer_command_pool, 0); + } } void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt) { +// fprintf(stderr, "record_upload src = %d | %d %d %d @ %d\n", src.dims, src.w, src.h, src.c, src.elempack); + + // NOTE keep the hack here ? if (src.elemsize / src.elempack == 4) { if (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0)) @@ -1311,147 +1257,468 @@ void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt) Mat src_flattened = src.reshape(src.w * src.h * src.c); - dst.create_like(src_flattened, weight_vkallocator, staging_vkallocator); + // create dst + dst.create_like(src_flattened, opt.blob_vkallocator); - // set weight blob as readonly - dst.data->state = 4; - - // we can skip queue transfer and staging buffer allocation - // only on unified memory architecture and unified compute/transfer queue - // which is usually the case on integrated gpu / cpu - if (dst.allocator->mappable && queue_family_index == vkdev->info.compute_queue_family_index) + if (dst.allocator->mappable) { - dst.upload(src_flattened); + // memcpy src_flattened to device + memcpy(dst.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize); + dst.allocator->flush(dst.data); + + // barrier device host-write @ null to shader-read @ compute + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // mark device shader-readwrite @ compute + dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; return; } - record_type r; - r.size = src_flattened.total() * src_flattened.elemsize; - r.mat = src_flattened; - r.vkmat = dst; - delayed_records.push_back(r); -} + // create staging + VkMat dst_staging; + dst_staging.create_like(src_flattened, opt.staging_vkallocator); -int VkTransfer::submit_and_wait() -{ - if (delayed_records.empty()) - return 0; + // memcpy src_flattened to staging + memcpy(dst_staging.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize); - int transfer_count = delayed_records.size(); + VkCommandBuffer command_buffer; + if (vkdev->info.unified_compute_transfer_queue) + { + command_buffer = compute_command_buffer; + } + else + { + command_buffer = upload_command_buffer; + } - // solve staging buffer size - size_t staging_buffer_size = 0; - for (int i=0; ifastMalloc(staging_buffer_size); + // record staging to device + { + VkBufferCopy region; + region.srcOffset = dst_staging.buffer_offset(); + region.dstOffset = dst.buffer_offset(); + region.size = std::min(dst_staging.buffer_capacity(), dst.buffer_capacity()); + + vkCmdCopyBuffer(command_buffer, dst_staging.buffer(), dst.buffer(), 1, ®ion); + } - // copy upload data - size_t mapped_ptr_offset = 0; - for (int i=0; iinfo.unified_compute_transfer_queue) + { + // barrier device transfer-write @ compute to shader-read @ compute + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + } + else { - const record_type& r = delayed_records[i]; + // queue ownership transfer any @ transfer to shader-read @ compute - memcpy((unsigned char*)staging_data->mapped_ptr + mapped_ptr_offset, r.mat.data, r.size); + // release + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = 0; + barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index; + barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + + vkCmdPipelineBarrier(upload_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } - mapped_ptr_offset += alignSize(r.size, buffer_offset_alignment); + // acquire + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index; + barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } } - staging_vkallocator->flush(staging_data); + // mark device shader-readwrite @ compute + dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - begin_command_buffer(); + // stash staging + upload_staging_buffers.push_back(dst_staging); +} + +int VkTransfer::submit_and_wait() +{ +// fprintf(stderr, "submit_and_wait\n"); + + // end command buffer + { + end_command_buffer(); + } -// fprintf(stderr, "cmd transfer %p %lu\n", staging_data->buffer, staging_buffer_size); + VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index); + if (compute_queue == 0) + { + fprintf(stderr, "out of compute queue\n"); + return -1; + } - // handle delayed records - size_t staging_buffer_offset = 0; - for (int i=0; iinfo.unified_compute_transfer_queue) + { + // submit compute + { + VkSubmitInfo submitInfo; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.pNext = 0; + submitInfo.waitSemaphoreCount = 0; + submitInfo.pWaitSemaphores = 0; + submitInfo.pWaitDstStageMask = 0; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &compute_command_buffer; + submitInfo.signalSemaphoreCount = 0; + submitInfo.pSignalSemaphores = 0; + + VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, compute_command_fence); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkQueueSubmit failed %d\n", ret); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + } + } + else { - const record_type& r = delayed_records[i]; + VkQueue transfer_queue = vkdev->acquire_queue(vkdev->info.transfer_queue_family_index); + if (transfer_queue == 0) + { + fprintf(stderr, "out of transfer queue\n"); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } - copy_buffer(staging_data->buffer, staging_buffer_offset, r.vkmat.buffer(), r.vkmat.buffer_offset(), r.size); + // submit upload compute + { + VkSubmitInfo submitInfo; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.pNext = 0; + submitInfo.waitSemaphoreCount = 0; + submitInfo.pWaitSemaphores = 0; + submitInfo.pWaitDstStageMask = 0; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &upload_command_buffer; + submitInfo.signalSemaphoreCount = 1; + submitInfo.pSignalSemaphores = &upload_compute_semaphore; + + VkResult ret = vkQueueSubmit(transfer_queue, 1, &submitInfo, upload_command_fence); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkQueueSubmit failed %d\n", ret); + vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index, transfer_queue); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + } + { + VkPipelineStageFlags wait_dst_stage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;// FIXME + + VkSubmitInfo submitInfo; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.pNext = 0; + submitInfo.waitSemaphoreCount = 1; + submitInfo.pWaitSemaphores = &upload_compute_semaphore; + submitInfo.pWaitDstStageMask = &wait_dst_stage; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &compute_command_buffer; + submitInfo.signalSemaphoreCount = 0; + submitInfo.pSignalSemaphores = 0; + + VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, compute_command_fence); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkQueueSubmit failed %d\n", ret); + vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index, transfer_queue); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + } - staging_buffer_offset += alignSize(r.size, buffer_offset_alignment); + vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index, transfer_queue); } - // owner transfer release - for (int i=0; ireclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); - queue_transfer_release_barrier(r.vkmat.buffer(), r.vkmat.buffer_offset(), r.size, vkdev->info.compute_queue_family_index); + // wait + if (vkdev->info.unified_compute_transfer_queue) + { + VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &compute_command_fence, VK_TRUE, UINT64_MAX); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkWaitForFences failed %d\n", ret); + return -1; + } } + else + { + VkFence fences[2] = { upload_command_fence, compute_command_fence }; - end_command_buffer(); + VkResult ret = vkWaitForFences(vkdev->vkdevice(), 2, fences, VK_TRUE, UINT64_MAX); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkWaitForFences failed %d\n", ret); + return -1; + } + } - int ret = queue_submit_and_wait_fence(); + return 0; +} - // compute queue owner transfer acquire +int VkTransfer::init() +{ + // compute_command_pool { - VkCompute cmd(vkdev); + VkCommandPoolCreateInfo commandPoolCreateInfo; + commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + commandPoolCreateInfo.pNext = 0; + commandPoolCreateInfo.flags = 0; + commandPoolCreateInfo.queueFamilyIndex = vkdev->info.compute_queue_family_index; - for (int i=0; ivkdevice(), &commandPoolCreateInfo, 0, &compute_command_pool); + if (ret != VK_SUCCESS) { - const record_type& r = delayed_records[i]; + fprintf(stderr, "vkCreateCommandPool failed %d\n", ret); + return -1; + } + } - cmd.record_queue_transfer_acquire(r.vkmat, queue_family_index); + // compute_command_buffer + { + VkCommandBufferAllocateInfo commandBufferAllocateInfo; + commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + commandBufferAllocateInfo.pNext = 0; + commandBufferAllocateInfo.commandPool = compute_command_pool; + commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + commandBufferAllocateInfo.commandBufferCount = 1; + + VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &compute_command_buffer); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkAllocateCommandBuffers failed %d\n", ret); + return -1; } + } + + // compute_command_fence + { + VkFenceCreateInfo fenceCreateInfo; + fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceCreateInfo.pNext = 0; + fenceCreateInfo.flags = 0; - cmd.submit_and_wait(); + VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &compute_command_fence); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkCreateFence failed %d\n", ret); + return -1; + } } - // deallocate staging buffer - staging_vkallocator->fastFree(staging_data); - staging_data = 0; + if (!vkdev->info.unified_compute_transfer_queue) + { + // transfer_command_pool + { + VkCommandPoolCreateInfo commandPoolCreateInfo; + commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + commandPoolCreateInfo.pNext = 0; + commandPoolCreateInfo.flags = 0; + commandPoolCreateInfo.queueFamilyIndex = vkdev->info.transfer_queue_family_index; + + VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &transfer_command_pool); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkCreateCommandPool failed %d\n", ret); + return -1; + } + } - delayed_records.clear(); + // upload_command_buffer + { + VkCommandBufferAllocateInfo commandBufferAllocateInfo; + commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + commandBufferAllocateInfo.pNext = 0; + commandBufferAllocateInfo.commandPool = transfer_command_pool; + commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + commandBufferAllocateInfo.commandBufferCount = 1; + + VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &upload_command_buffer); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkAllocateCommandBuffers failed %d\n", ret); + return -1; + } + } - return ret; -} + // upload_compute_semaphore + { + VkSemaphoreCreateInfo semaphoreCreateInfo; + semaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + semaphoreCreateInfo.pNext = 0; + semaphoreCreateInfo.flags = 0; + + VkResult ret = vkCreateSemaphore(vkdev->vkdevice(), &semaphoreCreateInfo, 0, &upload_compute_semaphore); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkCreateSemaphore failed %d\n", ret); + return -1; + } + } -void VkTransfer::copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size) -{ -// fprintf(stderr, "cmd copy %p to %p\n", src, dst); + // upload_command_fence + { + VkFenceCreateInfo fenceCreateInfo; + fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceCreateInfo.pNext = 0; + fenceCreateInfo.flags = 0; + + VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &upload_command_fence); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkCreateFence failed %d\n", ret); + return -1; + } + } + } - VkBufferCopy region; - region.srcOffset = src_offset; - region.dstOffset = dst_offset; - region.size = size; + begin_command_buffer(); - vkCmdCopyBuffer(command_buffer, src, dst, 1, ®ion); + return 0; } -void VkTransfer::copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector& regions) +int VkTransfer::begin_command_buffer() { -// fprintf(stderr, "cmd copy regions %p to %p\n", src, dst); + { + VkCommandBufferBeginInfo commandBufferBeginInfo; + commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + commandBufferBeginInfo.pNext = 0; + commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + commandBufferBeginInfo.pInheritanceInfo = 0; + + VkResult ret = vkBeginCommandBuffer(compute_command_buffer, &commandBufferBeginInfo); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkBeginCommandBuffer failed %d\n", ret); + return -1; + } + } + + if (!vkdev->info.unified_compute_transfer_queue) + { + { + VkCommandBufferBeginInfo commandBufferBeginInfo; + commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + commandBufferBeginInfo.pNext = 0; + commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + commandBufferBeginInfo.pInheritanceInfo = 0; + + VkResult ret = vkBeginCommandBuffer(upload_command_buffer, &commandBufferBeginInfo); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkBeginCommandBuffer failed %d\n", ret); + return -1; + } + } + } - vkCmdCopyBuffer(command_buffer, src, dst, regions.size(), regions.data()); + return 0; } -void VkTransfer::queue_transfer_release_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t dst_queue_family_index) +int VkTransfer::end_command_buffer() { -// fprintf(stderr, "cmd queue_transfer_release_barrier %p[+%lu] %lu %lu -> %lu\n", buffer, offset, size, queue_family_index, dst_queue_family_index); - - VkBufferMemoryBarrier bufferBarrier; - bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - bufferBarrier.pNext = 0; - bufferBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - bufferBarrier.dstAccessMask = 0; - bufferBarrier.srcQueueFamilyIndex = queue_family_index; - bufferBarrier.dstQueueFamilyIndex = dst_queue_family_index; - bufferBarrier.buffer = buffer; - bufferBarrier.offset = offset; - bufferBarrier.size = size; - - VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; - VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; - - vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0); + { + VkResult ret = vkEndCommandBuffer(compute_command_buffer); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkEndCommandBuffer failed %d\n", ret); + return -1; + } + } + + if (!vkdev->info.unified_compute_transfer_queue) + { + { + VkResult ret = vkEndCommandBuffer(upload_command_buffer); + if (ret != VK_SUCCESS) + { + fprintf(stderr, "vkEndCommandBuffer failed %d\n", ret); + return -1; + } + } + } + + return 0; } } // namespace ncnn diff --git a/src/command.h b/src/command.h index cc161cb2c..133e279b9 100644 --- a/src/command.h +++ b/src/command.h @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at @@ -22,61 +22,31 @@ #include #include #include "mat.h" -#include "pipeline.h" namespace ncnn { -class Command -{ -public: - Command(const VulkanDevice* vkdev, uint32_t queue_family_index); - virtual ~Command(); - -protected: - int create_command_pool(); - int create_command_buffer(); - - // record issue - int begin_command_buffer(); - int end_command_buffer(); - int queue_submit_and_wait_fence(); - -protected: - const VulkanDevice* vkdev; - uint32_t queue_family_index; - - VkCommandPool command_pool; - VkCommandBuffer command_buffer; - - VkFence fence; -}; - -class VkCompute : public Command +class Pipeline; +class VkCompute { public: VkCompute(const VulkanDevice* vkdev); - ~VkCompute(); - - void record_upload(const VkMat& m); - - void record_download(const VkMat& m); + virtual ~VkCompute(); - void record_clone(const VkMat& src, const VkMat& dst); +public: + void record_upload(const Mat& src, VkMat& dst, const Option& opt); - void record_copy_region(const VkMat& src, const VkMat& dst, const VkBufferCopy& region); + void record_download(const VkMat& src, Mat& dst, const Option& opt); - void record_copy_regions(const VkMat& src, const VkMat& dst, const std::vector& regions); + void record_clone(const VkMat& src, VkMat& dst, const Option& opt); - void record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkMat& m); + void record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkMat& dispatcher); #if NCNN_BENCHMARK void record_write_timestamp(uint32_t query); #endif // NCNN_BENCHMARK - void record_queue_transfer_acquire(const VkMat& m, uint32_t src_queue_family_index); - #if __ANDROID_API__ >= 26 - void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& im, const VkMat& m); + void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst); #endif // __ANDROID_API__ >= 26 int submit_and_wait(); @@ -90,116 +60,73 @@ public: #endif // NCNN_BENCHMARK protected: - // record pipeline things - void record_bind_pipeline(VkPipeline pipeline); - void record_update_bindings(VkPipelineLayout pipeline_layout, VkDescriptorSetLayout descriptorset_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const std::vector& bindings); - void record_push_constants(VkPipelineLayout pipeline_layout, const std::vector& constants); - void record_dispatch(const uint32_t* group_count_xyz); - - // record barrier things - void record_transfer_compute_barrier(const VkMat& m); - void record_compute_transfer_barrier(const VkMat& m); - void record_compute_compute_barrier(const VkMat& m); - void record_transfer_transfer_barrier(const VkMat& m); - void record_host_transfer_barrier(const VkMat& m); - void record_transfer_host_barrier(const VkMat& m); - void record_host_compute_barrier(const VkMat& m); - void record_compute_host_barrier(const VkMat& m); - - // record prepare things - void record_prepare_transfer_barrier(const VkMat& m); - void record_prepare_compute_barrier(const VkMat& m); - void record_prepare_host_barrier(const VkMat& m); - - void record_initial_image_compute_barrier(const VkImageMat& im); + int init(); + int begin_command_buffer(); + int end_command_buffer(); -#if __ANDROID_API__ >= 26 - void record_update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorSetLayout descriptorset_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, VkSampler sampler, const VkImageMat& im, const VkMat& m); -#endif // __ANDROID_API__ >= 26 +protected: + const VulkanDevice* vkdev; -#if NCNN_BENCHMARK - void reset_query_pool(); -#endif // NCNN_BENCHMARK + VkCommandPool compute_command_pool; -protected: - // recording issue - void copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size); - void copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector& regions); - void bind_pipeline(VkPipeline pipeline); - void bind_descriptorset(VkPipelineLayout pipeline_layout, VkDescriptorSet descriptorset); - void update_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const std::vector& descriptorBufferInfos); - void push_constants(VkPipelineLayout pipeline_layout, const std::vector& constants); - void dispatch(const uint32_t* group_count_xyz); - void transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t size); - void compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t size); - void compute_compute_barrier(VkBuffer buffer, size_t offset, size_t size); - void transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size); - void host_transfer_barrier(VkBuffer buffer, size_t offset, size_t size); - void transfer_host_barrier(VkBuffer buffer, size_t offset, size_t size); - void host_compute_barrier(VkBuffer buffer, size_t offset, size_t size); - void compute_host_barrier(VkBuffer buffer, size_t offset, size_t size); - void queue_transfer_acquire_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t src_queue_family_index); - void initial_image_compute_barrier(VkImage image); -#if __ANDROID_API__ >= 26 - void update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const VkDescriptorImageInfo& descriptorImageInfo, const VkDescriptorBufferInfo& descriptorBufferInfo); -#endif // __ANDROID_API__ >= 26 -#if NCNN_BENCHMARK - void write_timestamp(uint32_t query); -#endif // NCNN_BENCHMARK + VkCommandBuffer compute_command_buffer; + + VkFence compute_command_fence; + + std::vector upload_staging_buffers; + std::vector download_post_buffers; + std::vector download_post_mats; -protected: - // delayed record // the good-old path for device without VK_KHR_push_descriptor std::vector descriptor_pools; std::vector descriptorsets; - struct record_type + + struct record { - // 0=copy - // 1=copy regions - // 2=bind pipeline - // 3=bind descriptorset - // 4=push constants - // 5=dispatch - // 6=transfer-compute barrier - // 7=compute-transfer barrier - // 8=compute-compute barrier - // 9=transfer-transfer barrier - // 10=write timestamp - // 11=initial image compute barrier - // 12=host-transfer barrier - // 13=transfer-host barrier - // 14=host-compute barrier - // 15=compute-host barrier - // 16=queue-transfer-acquire barrier + enum + { + TYPE_copy_buffer, + TYPE_bind_pipeline, + TYPE_bind_descriptorsets, + TYPE_push_constants, + TYPE_dispatch, + TYPE_memory_barrers, + TYPE_buffer_barrers, + TYPE_image_barrers, + +#if NCNN_BENCHMARK + TYPE_write_timestamp, +#endif // NCNN_BENCHMARK + + TYPE_post_download, + }; + int type; + VkCommandBuffer command_buffer; union { - struct { VkBuffer src; size_t src_offset; VkBuffer dst; size_t dst_offset; size_t size; } copy; - struct { VkBuffer src; VkBuffer dst; } copy_regions; - struct { VkPipeline pipeline; } bind_pipeline; - struct { VkPipelineLayout pipeline_layout; VkDescriptorSet descriptorset; } bind_descriptorset; - struct { VkPipelineLayout pipeline_layout; } push_constants; - struct { uint32_t group_count_xyz[3]; } dispatch; - struct { VkBuffer buffer; size_t offset; size_t size; } transfer_compute_barrier; - struct { VkBuffer buffer; size_t offset; size_t size; } compute_transfer_barrier; - struct { VkBuffer buffer; size_t offset; size_t size; } compute_compute_barrier; - struct { VkBuffer buffer; size_t offset; size_t size; } transfer_transfer_barrier; + struct { VkBuffer src; VkBuffer dst; uint32_t region_count; const VkBufferCopy* regions; } copy_buffer; + + struct { VkPipelineBindPoint bind_point; VkPipeline pipeline; } bind_pipeline; + struct { VkPipelineBindPoint bind_point; VkPipelineLayout pipeline_layout; uint32_t descriptorset_count; uint32_t descriptorset_offset; } bind_descriptorsets; + struct { VkPipelineLayout pipeline_layout; VkShaderStageFlags stage_flags; uint32_t size; const void* values; } push_constants; + + struct { uint32_t group_count_x; uint32_t group_count_y; uint32_t group_count_z; } dispatch; + + struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkMemoryBarrier* barriers; } memory_barrers; + struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkBufferMemoryBarrier* barriers; } buffer_barrers; + struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkImageMemoryBarrier* barriers; } image_barrers; + #if NCNN_BENCHMARK struct { uint32_t query; } write_timestamp; #endif // NCNN_BENCHMARK - struct { VkImage image; } initial_image_compute_barrier; - struct { VkBuffer buffer; size_t offset; size_t size; } host_transfer_barrier; - struct { VkBuffer buffer; size_t offset; size_t size; } transfer_host_barrier; - struct { VkBuffer buffer; size_t offset; size_t size; } host_compute_barrier; - struct { VkBuffer buffer; size_t offset; size_t size; } compute_host_barrier; - struct { VkBuffer buffer; size_t offset; size_t size; size_t src_queue_family_index; } queue_transfer_acquire_barrier; - }; - std::vector regions; - std::vector constants; + struct { uint32_t download_post_buffer_mat_offset; } post_download; + }; }; - std::vector delayed_records; + + std::vector delayed_records; #if NCNN_BENCHMARK uint32_t query_count; @@ -207,38 +134,37 @@ protected: #endif // NCNN_BENCHMARK }; -class VkTransfer : public Command +class VkTransfer { public: VkTransfer(const VulkanDevice* vkdev); ~VkTransfer(); +public: void record_upload(const Mat& src, VkMat& dst, const Option& opt); int submit_and_wait(); -public: - VkAllocator* weight_vkallocator; - VkAllocator* staging_vkallocator; - protected: - // recording issue - void copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size); - void copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector& regions); - void queue_transfer_release_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t target_queue_family_index); + int init(); + int begin_command_buffer(); + int end_command_buffer(); protected: - size_t buffer_offset_alignment; - VkBufferMemory* staging_data; + const VulkanDevice* vkdev; - // delayed record - struct record_type - { - size_t size; - Mat mat; - VkMat vkmat; - }; - std::vector delayed_records; + VkCommandPool compute_command_pool; + VkCommandPool transfer_command_pool; + + VkCommandBuffer upload_command_buffer; + VkCommandBuffer compute_command_buffer; + + VkSemaphore upload_compute_semaphore; + + VkFence upload_command_fence; + VkFence compute_command_fence; + + std::vector upload_staging_buffers; }; } // namespace ncnn diff --git a/src/gpu.cpp b/src/gpu.cpp index 481594b5f..8d71753d4 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -628,6 +628,8 @@ int create_gpu_instance() gpu_info.graphics_queue_count = queueFamilyProperties[gpu_info.graphics_queue_family_index].queueCount; gpu_info.transfer_queue_count = queueFamilyProperties[gpu_info.transfer_queue_family_index].queueCount; + gpu_info.unified_compute_transfer_queue = gpu_info.compute_queue_family_index == gpu_info.transfer_queue_family_index; + // cache memory properties vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.physicalDeviceMemoryProperties); diff --git a/src/gpu.h b/src/gpu.h index 68f15ed20..28dd4fcaa 100644 --- a/src/gpu.h +++ b/src/gpu.h @@ -111,6 +111,9 @@ public: uint32_t graphics_queue_count; uint32_t transfer_queue_count; + // property + bool unified_compute_transfer_queue; + // bug is not feature bool bug_local_size_spec_const; diff --git a/src/layer.cpp b/src/layer.cpp index 39f080de8..340caaf05 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -121,11 +121,7 @@ int Layer::forward(const std::vector& bottom_blobs, std::vector& t top_blobs.resize(bottom_blobs.size()); for (int i = 0; i < (int)top_blobs.size(); i++) { - top_blobs[i].create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator); - if (top_blobs[i].empty()) - return -100; - - cmd.record_clone(bottom_blobs[i], top_blobs[i]); + cmd.record_clone(bottom_blobs[i], top_blobs[i], opt); } return forward_inplace(top_blobs, cmd, opt); @@ -136,11 +132,7 @@ int Layer::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, co if (!support_inplace) return -1; - top_blob.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator); - if (top_blob.empty()) - return -100; - - cmd.record_clone(bottom_blob, top_blob); + cmd.record_clone(bottom_blob, top_blob, opt); return forward_inplace(top_blob, cmd, opt); } diff --git a/src/layer/vulkan/binaryop_vulkan.cpp b/src/layer/vulkan/binaryop_vulkan.cpp index 48a0ab9fb..29f3c267e 100644 --- a/src/layer/vulkan/binaryop_vulkan.cpp +++ b/src/layer/vulkan/binaryop_vulkan.cpp @@ -317,21 +317,21 @@ int BinaryOp_vulkan::forward(const std::vector& bottom_blobs, std::vector // broadcast if (bottom_blob.dims > bottom_blob1.dims) { - top_blob.create_like(bottom_blob, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create_like(bottom_blob, opt.blob_vkallocator); } else if (bottom_blob.dims < bottom_blob1.dims) { - top_blob.create_like(bottom_blob1, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create_like(bottom_blob1, opt.blob_vkallocator); } else // if (bottom_blob.dims == bottom_blob1.dims) { if (bottom_blob.w * bottom_blob.h * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.c * bottom_blob1.elempack) { - top_blob.create_like(bottom_blob, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create_like(bottom_blob, opt.blob_vkallocator); } else { - top_blob.create_like(bottom_blob1, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create_like(bottom_blob1, opt.blob_vkallocator); } } if (top_blob.empty()) diff --git a/src/layer/vulkan/cast_vulkan.cpp b/src/layer/vulkan/cast_vulkan.cpp index 99ef08ecd..003319496 100644 --- a/src/layer/vulkan/cast_vulkan.cpp +++ b/src/layer/vulkan/cast_vulkan.cpp @@ -234,15 +234,15 @@ int Cast_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c if (dims == 1) { - top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator); } else if (dims == 2) { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator); } else if (dims == 3) { - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator); } if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/concat_vulkan.cpp b/src/layer/vulkan/concat_vulkan.cpp index 251ccb082..2baeb9847 100644 --- a/src/layer/vulkan/concat_vulkan.cpp +++ b/src/layer/vulkan/concat_vulkan.cpp @@ -312,14 +312,14 @@ int Concat_vulkan::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vectorforward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1); } - top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; @@ -515,7 +515,7 @@ int Crop_vulkan::forward(const std::vector& bottom_blobs, std::vector 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0)) { - top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator); + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); } else { - top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); } if (top_blob_bordered.empty()) return -100; @@ -528,8 +528,7 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC int wcut = top_blob_bordered_adj.w - output_w; int hcut = top_blob_bordered_adj.h - output_h; - VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); - crop_param_blob.prepare_staging_buffer(); + VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); int* crop_params = crop_param_blob.mapped(); if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233) diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp index f91053896..cd0f155cd 100644 --- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp +++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp @@ -597,11 +597,11 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_ VkMat top_blob_bordered; if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0)) { - top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator); + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); } else { - top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); } if (top_blob_bordered.empty()) return -100; @@ -681,8 +681,7 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_ int wcut = top_blob_bordered_adj.w - output_w; int hcut = top_blob_bordered_adj.h - output_h; - VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); - crop_param_blob.prepare_staging_buffer(); + VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); int* crop_params = crop_param_blob.mapped(); if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233) @@ -763,7 +762,7 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_ VkMat top_blob_unpacked = top_blob_bordered; if (out_elempack_g < out_elempack) { - top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator, opt.staging_vkallocator); + top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator); if (top_blob_unpacked.empty()) return -100; } @@ -883,8 +882,7 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_ int wcut = top_blob_bordered_adj.w - output_w; int hcut = top_blob_bordered_adj.h - output_h; - VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); - crop_param_blob.prepare_staging_buffer(); + VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); int* crop_params = crop_param_blob.mapped(); if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233) diff --git a/src/layer/vulkan/deepcopy_vulkan.cpp b/src/layer/vulkan/deepcopy_vulkan.cpp index 1672cfcd9..2edb60970 100644 --- a/src/layer/vulkan/deepcopy_vulkan.cpp +++ b/src/layer/vulkan/deepcopy_vulkan.cpp @@ -144,7 +144,7 @@ int DeepCopy_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkComput { int elempack = bottom_blob.elempack; - top_blob.create_like(bottom_blob, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create_like(bottom_blob, opt.blob_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/eltwise_vulkan.cpp b/src/layer/vulkan/eltwise_vulkan.cpp index 07d053e09..146358ba1 100644 --- a/src/layer/vulkan/eltwise_vulkan.cpp +++ b/src/layer/vulkan/eltwise_vulkan.cpp @@ -157,7 +157,7 @@ int Eltwise_vulkan::forward(const std::vector& bottom_blobs, std::vector< int elempack = bottom_blob.elempack; VkMat& top_blob = top_blobs[0]; - top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/flatten_vulkan.cpp b/src/layer/vulkan/flatten_vulkan.cpp index cd4810ef5..25e54efd0 100644 --- a/src/layer/vulkan/flatten_vulkan.cpp +++ b/src/layer/vulkan/flatten_vulkan.cpp @@ -205,7 +205,7 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute return 0; } - top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/innerproduct_vulkan.cpp b/src/layer/vulkan/innerproduct_vulkan.cpp index 77b4f802e..d95223bd5 100644 --- a/src/layer/vulkan/innerproduct_vulkan.cpp +++ b/src/layer/vulkan/innerproduct_vulkan.cpp @@ -306,7 +306,7 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo if (out_elempack == 1) out_elemsize = 4u; } - top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/instancenorm_vulkan.cpp b/src/layer/vulkan/instancenorm_vulkan.cpp index 530176647..6294b6928 100644 --- a/src/layer/vulkan/instancenorm_vulkan.cpp +++ b/src/layer/vulkan/instancenorm_vulkan.cpp @@ -380,7 +380,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, int elempack = bottom_top_blob.elempack; // mean - VkMat mean_workspace(c, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); + VkMat mean_workspace(c, elemsize, elempack, opt.workspace_vkallocator); { // reduce sum VkMat sum_workspace; @@ -389,7 +389,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, int reduced_h = 1; int reduced_c = bottom_top_blob.c; - sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); + sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator); { std::vector bindings(2); bindings[0] = bottom_top_blob; @@ -419,7 +419,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, int reduced_c = sum_workspace.c; VkMat sum_workspace_reduced; - sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); + sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator); { std::vector bindings(2); @@ -466,11 +466,11 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, } // var - VkMat var_workspace(c, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); + VkMat var_workspace(c, elemsize, elempack, opt.workspace_vkallocator); { // sub mean and square VkMat square_workspace; - square_workspace.create(w, h, c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); + square_workspace.create(w, h, c, 4u*elempack, elempack, opt.workspace_vkallocator); { std::vector bindings(3); bindings[0] = bottom_top_blob; @@ -509,7 +509,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, int reduced_c = sqsum_workspace.c; VkMat sqsum_workspace_reduced; - sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); + sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator); { std::vector bindings(2); @@ -557,7 +557,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, // coeffs VkMat coeffs_workspace; - coeffs_workspace.create(c, elemsize * 2, elempack * 2, opt.workspace_vkallocator, opt.staging_vkallocator); + coeffs_workspace.create(c, elemsize * 2, elempack * 2, opt.workspace_vkallocator); { std::vector bindings(5); bindings[0] = coeffs_workspace; diff --git a/src/layer/vulkan/interp_vulkan.cpp b/src/layer/vulkan/interp_vulkan.cpp index e79a1f45a..b4ad74f19 100644 --- a/src/layer/vulkan/interp_vulkan.cpp +++ b/src/layer/vulkan/interp_vulkan.cpp @@ -274,7 +274,7 @@ int Interp_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& return 0; } - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; @@ -306,11 +306,11 @@ int Interp_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& } else if (resize_type == 3) // bicubic { - VkMat alpha(outw, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator, opt.staging_vkallocator); + VkMat alpha(outw, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator); if (alpha.empty()) return -100; - VkMat xofs(outw, (size_t)4u, 1, opt.workspace_vkallocator, opt.staging_vkallocator); + VkMat xofs(outw, (size_t)4u, 1, opt.workspace_vkallocator); if (xofs.empty()) return -100; @@ -328,11 +328,11 @@ int Interp_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd.record_pipeline(pipeline_interp_bicubic_coeffs_x, bindings, constants, alpha); } - VkMat beta(outh, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator, opt.staging_vkallocator); + VkMat beta(outh, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator); if (beta.empty()) return -100; - VkMat yofs(outh, (size_t)4u, 1, opt.workspace_vkallocator, opt.staging_vkallocator); + VkMat yofs(outh, (size_t)4u, 1, opt.workspace_vkallocator); if (yofs.empty()) return -100; diff --git a/src/layer/vulkan/lrn_vulkan.cpp b/src/layer/vulkan/lrn_vulkan.cpp index ab66ae0b0..b9454c259 100644 --- a/src/layer/vulkan/lrn_vulkan.cpp +++ b/src/layer/vulkan/lrn_vulkan.cpp @@ -254,11 +254,11 @@ int LRN_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Op if (region_type == NormRegion_ACROSS_CHANNELS) { // always create scalar square workspace blob for norm across channel - square_workspace.create(w, h, channels * elempack + local_size - 1, 4u, 1, opt.workspace_vkallocator, opt.staging_vkallocator); + square_workspace.create(w, h, channels * elempack + local_size - 1, 4u, 1, opt.workspace_vkallocator); } else if (region_type == NormRegion_WITHIN_CHANNEL) { - square_workspace.create(w + local_size - 1, h + local_size - 1, channels, elempack * 4u, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); + square_workspace.create(w + local_size - 1, h + local_size - 1, channels, elempack * 4u, elempack, opt.workspace_vkallocator); } // square pad diff --git a/src/layer/vulkan/normalize_vulkan.cpp b/src/layer/vulkan/normalize_vulkan.cpp index 0bbac00d1..004844097 100644 --- a/src/layer/vulkan/normalize_vulkan.cpp +++ b/src/layer/vulkan/normalize_vulkan.cpp @@ -298,7 +298,7 @@ int Normalize_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co reduced_c = (bottom_top_blob.c + 3) / 4; } - sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); + sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator); { std::vector bindings(2); bindings[0] = bottom_top_blob; @@ -347,7 +347,7 @@ int Normalize_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co } VkMat sqsum_workspace_reduced; - sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); + sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator); { std::vector bindings(2); @@ -377,7 +377,7 @@ int Normalize_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co // coeffs VkMat coeffs_workspace; - coeffs_workspace.create(sqsum_workspace.w * sqsum_workspace.h * sqsum_workspace.c, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); + coeffs_workspace.create(sqsum_workspace.w * sqsum_workspace.h * sqsum_workspace.c, elemsize, elempack, opt.workspace_vkallocator); { std::vector bindings(2); bindings[0] = sqsum_workspace; diff --git a/src/layer/vulkan/packing_vulkan.cpp b/src/layer/vulkan/packing_vulkan.cpp index 327d9e597..8df5cc70b 100644 --- a/src/layer/vulkan/packing_vulkan.cpp +++ b/src/layer/vulkan/packing_vulkan.cpp @@ -203,7 +203,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute if (out_elempack == 1) out_elemsize = 4u; } - top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } @@ -219,7 +219,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute if (out_elempack == 1) out_elemsize = 4u; } - top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } @@ -235,7 +235,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute if (out_elempack == 1) out_elemsize = 4u; } - top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } diff --git a/src/layer/vulkan/padding_vulkan.cpp b/src/layer/vulkan/padding_vulkan.cpp index 191b46cc7..f614bcc0a 100644 --- a/src/layer/vulkan/padding_vulkan.cpp +++ b/src/layer/vulkan/padding_vulkan.cpp @@ -170,7 +170,7 @@ int Padding_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute int outw = w + left + right; int outh = h + top + bottom; - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; @@ -239,7 +239,7 @@ int Padding_vulkan::forward(const std::vector& bottom_blobs, std::vector< int outw = w + _left + _right; int outh = h + _top + _bottom; - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/permute_vulkan.cpp b/src/layer/vulkan/permute_vulkan.cpp index 111fb1efe..5744d0c4f 100644 --- a/src/layer/vulkan/permute_vulkan.cpp +++ b/src/layer/vulkan/permute_vulkan.cpp @@ -270,7 +270,7 @@ int Permute_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute if (out_elempack == 1) out_elemsize = 4u; } - top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } @@ -329,7 +329,7 @@ int Permute_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute if (out_elempack == 1) out_elemsize = 4u; } - top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; } diff --git a/src/layer/vulkan/pixelshuffle_vulkan.cpp b/src/layer/vulkan/pixelshuffle_vulkan.cpp index a4b64d010..b6974dafd 100644 --- a/src/layer/vulkan/pixelshuffle_vulkan.cpp +++ b/src/layer/vulkan/pixelshuffle_vulkan.cpp @@ -200,7 +200,7 @@ int PixelShuffle_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo if (out_elempack == 1) out_elemsize = 4u; } - top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/pooling_vulkan.cpp b/src/layer/vulkan/pooling_vulkan.cpp index b4c0e6f49..fd31230af 100644 --- a/src/layer/vulkan/pooling_vulkan.cpp +++ b/src/layer/vulkan/pooling_vulkan.cpp @@ -287,7 +287,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute if (global_pooling) { - top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; @@ -295,7 +295,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute bindings[0] = bottom_blob; bindings[1] = top_blob; - std::vector constants(12); + std::vector constants(10); constants[0].i = bottom_blob.dims; constants[1].i = bottom_blob.w; constants[2].i = bottom_blob.h; @@ -306,8 +306,6 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute constants[7].i = top_blob.h; constants[8].i = top_blob.c; constants[9].i = top_blob.cstep; - constants[10].i = 0; - constants[11].i = 0; const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_global_pack8 : elempack == 4 ? pipeline_pooling_global_pack4 @@ -336,8 +334,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute Option opt_pad = opt; opt_pad.blob_vkallocator = opt.workspace_vkallocator; - VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); - padding_param_blob.prepare_staging_buffer(); + VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); int* padding_params = padding_param_blob.mapped(); padding_params[0] = pad_top; @@ -369,8 +366,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute Option opt_pad = opt; opt_pad.blob_vkallocator = opt.workspace_vkallocator; - VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); - padding_param_blob.prepare_staging_buffer(); + VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); int* padding_params = padding_param_blob.mapped(); padding_params[0] = hpad / 2; @@ -396,8 +392,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute Option opt_pad = opt; opt_pad.blob_vkallocator = opt.workspace_vkallocator; - VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); - padding_param_blob.prepare_staging_buffer(); + VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); int* padding_params = padding_param_blob.mapped(); padding_params[0] = hpad - hpad / 2; @@ -421,7 +416,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute int outw = (w - kernel_w) / stride_w + 1; int outh = (h - kernel_h) / stride_h + 1; - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/priorbox_vulkan.cpp b/src/layer/vulkan/priorbox_vulkan.cpp index cdb95e0d9..2701b154a 100644 --- a/src/layer/vulkan/priorbox_vulkan.cpp +++ b/src/layer/vulkan/priorbox_vulkan.cpp @@ -163,7 +163,7 @@ int PriorBox_vulkan::forward(const std::vector& bottom_blobs, std::vector } VkMat& top_blob = top_blobs[0]; - top_blob.create(4 * w * h * num_prior / elempack, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(4 * w * h * num_prior / elempack, elemsize, elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; @@ -217,7 +217,7 @@ int PriorBox_vulkan::forward(const std::vector& bottom_blobs, std::vector } VkMat& top_blob = top_blobs[0]; - top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/reorg_vulkan.cpp b/src/layer/vulkan/reorg_vulkan.cpp index 9c7a48cb9..b77d27a87 100644 --- a/src/layer/vulkan/reorg_vulkan.cpp +++ b/src/layer/vulkan/reorg_vulkan.cpp @@ -192,7 +192,7 @@ int Reorg_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& if (out_elempack == 1) out_elemsize = 4u; } - top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/reshape_vulkan.cpp b/src/layer/vulkan/reshape_vulkan.cpp index 6031b53db..c71b063d0 100644 --- a/src/layer/vulkan/reshape_vulkan.cpp +++ b/src/layer/vulkan/reshape_vulkan.cpp @@ -275,7 +275,7 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute return 0; } - top_blob.create(_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); } else if (ndim == 2) { @@ -308,7 +308,7 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute return 0; } - top_blob.create(_w, _h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(_w, _h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); } else // if (ndim == 3) { @@ -348,7 +348,7 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute return 0; } - top_blob.create(_w, _h, _c / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(_w, _h, _c / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); } if (top_blob.empty()) diff --git a/src/layer/vulkan/shufflechannel_vulkan.cpp b/src/layer/vulkan/shufflechannel_vulkan.cpp index 43909f713..c5ea4262f 100644 --- a/src/layer/vulkan/shufflechannel_vulkan.cpp +++ b/src/layer/vulkan/shufflechannel_vulkan.cpp @@ -142,7 +142,7 @@ int ShuffleChannel_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, Vk size_t elemsize = bottom_blob.elemsize; int elempack = bottom_blob.elempack; - top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); + top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator); if (top_blob.empty()) return -100; diff --git a/src/layer/vulkan/slice_vulkan.cpp b/src/layer/vulkan/slice_vulkan.cpp index 481e3bd44..4711aefbd 100644 --- a/src/layer/vulkan/slice_vulkan.cpp +++ b/src/layer/vulkan/slice_vulkan.cpp @@ -314,7 +314,7 @@ int Slice_vulkan::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vectormappable) - return; - - if (staging_allocator && staging_data) - return; - - size_t totalsize = alignSize(total() * elemsize, 4); - staging_data = staging_allocator->fastMalloc(totalsize); - - staging_refcount = (int*)((unsigned char*)staging_data + offsetof(VkBufferMemory, refcount)); - *staging_refcount = 1; -} - -inline void VkMat::discard_staging_buffer() -{ - if (allocator->mappable) - return; - - if (staging_refcount && NCNN_XADD(staging_refcount, -1) == 1) - { - if (staging_allocator && staging_data) - { - staging_allocator->fastFree(staging_data); - } - } - - staging_data = 0; - staging_refcount = 0; -} - -inline void VkMat::upload(const Mat& m) -{ - memcpy(mapped_ptr(), m.data, m.total() * m.elemsize); - - if (allocator->mappable) - { - allocator->flush(data); - } -} - -inline void VkMat::download(Mat& m) const -{ - if (allocator->mappable) - { - allocator->invalidate(data); - } - - memcpy(m.data, mapped_ptr(), total() * elemsize); + create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); } inline Mat VkMat::mapped() const { + if (!allocator->mappable) + return Mat(); + if (dims == 1) return Mat(w, mapped_ptr(), elemsize, elempack, 0); @@ -1629,17 +1551,16 @@ inline Mat VkMat::mapped() const inline void* VkMat::mapped_ptr() const { - VkBufferMemory* mappable_data = allocator->mappable ? data : staging_data; - return (unsigned char*)mappable_data->mapped_ptr + mappable_data->offset; + if (!allocator->mappable) + return 0; + + return (unsigned char*)data->mapped_ptr + data->offset; } inline void VkMat::addref() { if (refcount) NCNN_XADD(refcount, 1); - - if (staging_refcount) - NCNN_XADD(staging_refcount, 1); } inline void VkMat::release() @@ -1652,16 +1573,7 @@ inline void VkMat::release() } } - if (staging_refcount && NCNN_XADD(staging_refcount, -1) == 1) - { - if (staging_allocator && staging_data) - { - staging_allocator->fastFree(staging_data); - } - } - data = 0; - staging_data = 0; elemsize = 0; elempack = 0; @@ -1674,7 +1586,6 @@ inline void VkMat::release() cstep = 0; refcount = 0; - staging_refcount = 0; } inline bool VkMat::empty() const @@ -1709,14 +1620,9 @@ inline size_t VkMat::buffer_offset() const return data->offset; } -inline VkBuffer VkMat::staging_buffer() const -{ - return staging_data->buffer; -} - -inline size_t VkMat::staging_buffer_offset() const +inline size_t VkMat::buffer_capacity() const { - return staging_data->offset; + return data->capacity; } inline VkImageMat::VkImageMat() diff --git a/src/net.cpp b/src/net.cpp index 846c16c00..aa36af833 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -924,14 +924,16 @@ int Net::upload_model() weight_staging_vkallocator = new VkWeightStagingBufferAllocator(vkdev); } - cmd.weight_vkallocator = weight_vkallocator; - cmd.staging_vkallocator = weight_staging_vkallocator; + Option opt_upload = opt; + opt_upload.blob_vkallocator = weight_vkallocator; + opt_upload.workspace_vkallocator = weight_vkallocator; + opt_upload.staging_vkallocator = weight_staging_vkallocator; for (size_t i=0; isupport_vulkan) { - int uret = layers[i]->upload_model(cmd, opt); + int uret = layers[i]->upload_model(cmd, opt_upload); if (uret != 0) { fprintf(stderr, "layer upload_model %d failed\n", (int)i); @@ -1347,12 +1349,7 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector // upload VkMat bottom_blob_unpacked; - bottom_blob_unpacked.create_like(bottom_blob_cpu_fp16, opt.blob_vkallocator, opt.staging_vkallocator); - - bottom_blob_unpacked.prepare_staging_buffer(); - bottom_blob_unpacked.upload(bottom_blob_cpu_fp16); - - cmd.record_upload(bottom_blob_unpacked); + cmd.record_upload(bottom_blob_cpu_fp16, bottom_blob_unpacked, opt); // cast to fp16 (integrated gpu) VkMat bottom_blob_unpacked_fp16; @@ -1390,11 +1387,8 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector if (layer->support_inplace && *bottom_blob.refcount != 1) { VkMat bottom_blob_copy; - bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator); - + cmd.record_clone(bottom_blob, bottom_blob_copy, opt); // fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); - - cmd.record_clone(bottom_blob, bottom_blob_copy); bottom_blob = bottom_blob_copy; } } @@ -1437,7 +1431,6 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector { // load bottom blobs std::vector bottom_blobs(layer->bottoms.size()); - std::vector bottom_blobs_unpacked(layer->bottoms.size()); for (size_t i=0; ibottoms.size(); i++) { int bottom_blob_index = layer->bottoms[i]; @@ -1471,13 +1464,8 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector } // upload - VkMat& bottom_blob_unpacked = bottom_blobs_unpacked[i]; - bottom_blob_unpacked.create_like(bottom_blob_cpu_fp16, opt.blob_vkallocator, opt.staging_vkallocator); - - bottom_blob_unpacked.prepare_staging_buffer(); - bottom_blob_unpacked.upload(bottom_blob_cpu_fp16); - - cmd.record_upload(bottom_blob_unpacked); + VkMat bottom_blob_unpacked; + cmd.record_upload(bottom_blob_cpu_fp16, bottom_blob_unpacked, opt); // cast to fp16 (integrated gpu) VkMat bottom_blob_unpacked_fp16; @@ -1515,11 +1503,8 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector if (layer->support_inplace && *bottom_blobs[i].refcount != 1) { VkMat bottom_blob_copy; - bottom_blob_copy.create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator); - + cmd.record_clone(bottom_blobs[i], bottom_blob_copy, opt); // fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); - - cmd.record_clone(bottom_blobs[i], bottom_blob_copy); bottom_blobs[i] = bottom_blob_copy; } } @@ -1602,11 +1587,8 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector if (layer->support_inplace && *bottom_blob.refcount != 1) { VkMat bottom_blob_copy; - bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator); - + cmd.record_clone(bottom_blob, bottom_blob_copy, opt); // fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); - - cmd.record_clone(bottom_blob, bottom_blob_copy); bottom_blob = bottom_blob_copy; } } @@ -1614,7 +1596,6 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector VkMat bottom_blob_unpacked_fp16; if (opt.use_packing_layout && layer->support_packing) { -// bottom_blob_unpacked_fp16 = bottom_blob; packing_pack4->forward(bottom_blob, bottom_blob_unpacked_fp16, cmd, opt); } else @@ -1635,8 +1616,8 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector } // download - bottom_blob_unpacked.prepare_staging_buffer(); - cmd.record_download(bottom_blob_unpacked); + Mat bottom_blob_cpu_fp16; + cmd.record_download(bottom_blob_unpacked, bottom_blob_cpu_fp16, opt); cmd.submit_and_wait(); @@ -1657,12 +1638,6 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector cmd.reset(); - Mat bottom_blob_cpu_fp16; - bottom_blob_cpu_fp16.create_like(bottom_blob_unpacked, opt.blob_allocator); - bottom_blob_unpacked.download(bottom_blob_cpu_fp16); - - bottom_blob_unpacked.discard_staging_buffer(); - // cast to fp32 (discrete gpu) Mat& bottom_blob_cpu = blob_mats[bottom_blob_index]; if (opt.use_fp16_storage && vkdev->info.type == 0) @@ -1742,7 +1717,7 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector else { // load bottom blobs - std::vector bottom_blobs_unpacked(layer->bottoms.size()); + std::vector bottom_blobs_cpu_fp16(layer->bottoms.size()); for (size_t i=0; ibottoms.size(); i++) { int bottom_blob_index = layer->bottoms[i]; @@ -1770,11 +1745,8 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector if (layer->support_inplace && *bottom_blob.refcount != 1) { VkMat bottom_blob_copy; - bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator); - + cmd.record_clone(bottom_blob, bottom_blob_copy, opt); // fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); - - cmd.record_clone(bottom_blob, bottom_blob_copy); bottom_blob = bottom_blob_copy; } } @@ -1782,7 +1754,6 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector VkMat bottom_blob_unpacked_fp16; if (opt.use_packing_layout && layer->support_packing) { -// bottom_blob_unpacked_fp16 = bottom_blob; packing_pack4->forward(bottom_blob, bottom_blob_unpacked_fp16, cmd, opt); } else @@ -1792,7 +1763,7 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector } // cast to fp32 (integrated gpu) - VkMat& bottom_blob_unpacked = bottom_blobs_unpacked[i]; + VkMat bottom_blob_unpacked; if (opt.use_fp16_storage && vkdev->info.type != 0) { cast_float16_to_float32->forward(bottom_blob_unpacked_fp16, bottom_blob_unpacked, cmd, opt); @@ -1803,8 +1774,8 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector } // download - bottom_blob_unpacked.prepare_staging_buffer(); - cmd.record_download(bottom_blob_unpacked); + Mat& bottom_blob_cpu_fp16 = bottom_blobs_cpu_fp16[i]; + cmd.record_download(bottom_blob_unpacked, bottom_blob_cpu_fp16, opt); } } } @@ -1837,13 +1808,7 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector if (blob_mats[bottom_blob_index].dims == 0) { - VkMat& bottom_blob_unpacked = bottom_blobs_unpacked[i]; - - Mat bottom_blob_cpu_fp16; - bottom_blob_cpu_fp16.create_like(bottom_blob_unpacked, opt.blob_allocator); - bottom_blob_unpacked.download(bottom_blob_cpu_fp16); - - bottom_blob_unpacked.discard_staging_buffer(); + const Mat& bottom_blob_cpu_fp16 = bottom_blobs_cpu_fp16[i]; // cast to fp32 (discrete gpu) Mat& bottom_blob_cpu = blob_mats[bottom_blob_index]; @@ -1884,7 +1849,7 @@ int Net::forward_layer(int layer_index, std::vector& blob_mats, std::vector } } - bottom_blobs_unpacked.clear(); + bottom_blobs_cpu_fp16.clear(); // forward if (opt.lightmode && layer->support_inplace) @@ -2113,15 +2078,15 @@ int Extractor::extract(int blob_index, Mat& feat) } // download - feat_gpu_unpacked.prepare_staging_buffer(); - cmd.record_download(feat_gpu_unpacked); + Mat feat_cpu_fp16; + cmd.record_download(feat_gpu_unpacked, feat_cpu_fp16, opt); cmd.submit_and_wait(); #if NCNN_BENCHMARK std::vector results(net->layers.size() * 2); cmd.get_query_pool_results(0, net->layers.size() * 2, results); - for (int i=0; ilayers.size(); i++) + for (size_t i=0; ilayers.size(); i++) { uint64_t start = results[i*2]; uint64_t end = results[i*2+1]; @@ -2133,12 +2098,6 @@ int Extractor::extract(int blob_index, Mat& feat) } #endif // NCNN_BENCHMARK - Mat feat_cpu_fp16; - feat_cpu_fp16.create_like(feat_gpu_unpacked, opt.blob_allocator); - feat_gpu_unpacked.download(feat_cpu_fp16); - - feat_gpu_unpacked.discard_staging_buffer(); - // cast to fp32 (discrete gpu) Mat& feat_cpu = blob_mats[blob_index]; if (opt.use_fp16_storage && net->vkdev->info.type == 0) diff --git a/src/pipeline.cpp b/src/pipeline.cpp index b587d6674..3a47a23da 100644 --- a/src/pipeline.cpp +++ b/src/pipeline.cpp @@ -267,7 +267,7 @@ int Pipeline::create_pipeline_layout(int push_constant_count) VkPushConstantRange pushConstantRange; pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; pushConstantRange.offset = 0; - pushConstantRange.size = sizeof(int) * push_constant_count; + pushConstantRange.size = sizeof(vk_constant_type) * push_constant_count; VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo; pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; diff --git a/tests/test_cast.cpp b/tests/test_cast.cpp index 5602bd4a5..42c004f29 100644 --- a/tests/test_cast.cpp +++ b/tests/test_cast.cpp @@ -207,39 +207,30 @@ static int test_cast_gpu_fp16p(const ncnn::Mat& a, int type_from, int type_to) a4_fp16 = a4; } - // upload - ncnn::VkMat a4_gpu; - a4_gpu.create_like(a4_fp16, opt.blob_vkallocator, opt.staging_vkallocator); - a4_gpu.prepare_staging_buffer(); - a4_gpu.upload(a4_fp16); - // forward ncnn::VkCompute cmd(vkdev); - cmd.record_upload(a4_gpu); + // upload + ncnn::VkMat a4_gpu; + cmd.record_upload(a4_fp16, a4_gpu, opt); ncnn::VkMat d4_gpu; if (op->support_inplace) { - d4_gpu.create_like(a4_gpu, a4_gpu.allocator, a4_gpu.staging_allocator); - cmd.record_clone(a4_gpu, d4_gpu); - op->forward_inplace(d4_gpu, cmd, opt); + op->forward_inplace(a4_gpu, cmd, opt); + + d4_gpu = a4_gpu; } else { op->forward(a4_gpu, d4_gpu, cmd, opt); } - d4_gpu.prepare_staging_buffer(); - - cmd.record_download(d4_gpu); + // download + cmd.record_download(d4_gpu, d, opt); cmd.submit_and_wait(); - // download - d.create_like(d4_gpu); - d4_gpu.download(d); - op->destroy_pipeline(opt); delete op; @@ -331,39 +322,30 @@ static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type a4_fp16 = a4; } - // upload - ncnn::VkMat a4_gpu; - a4_gpu.create_like(a4_fp16, opt.blob_vkallocator, opt.staging_vkallocator); - a4_gpu.prepare_staging_buffer(); - a4_gpu.upload(a4_fp16); - // forward ncnn::VkCompute cmd(vkdev); - cmd.record_upload(a4_gpu); + // upload + ncnn::VkMat a4_gpu; + cmd.record_upload(a4_fp16, a4_gpu, opt); ncnn::VkMat d4_gpu; if (op->support_inplace) { - d4_gpu.create_like(a4_gpu, a4_gpu.allocator, a4_gpu.staging_allocator); - cmd.record_clone(a4_gpu, d4_gpu); - op->forward_inplace(d4_gpu, cmd, opt); + op->forward_inplace(a4_gpu, cmd, opt); + + d4_gpu = a4_gpu; } else { op->forward(a4_gpu, d4_gpu, cmd, opt); } - d4_gpu.prepare_staging_buffer(); - - cmd.record_download(d4_gpu); + // download + cmd.record_download(d4_gpu, d, opt); cmd.submit_and_wait(); - // download - d.create_like(d4_gpu); - d4_gpu.download(d); - op->destroy_pipeline(opt); delete op; diff --git a/tests/testutil.h b/tests/testutil.h index 88bd8fee9..cd42ead5b 100644 --- a/tests/testutil.h +++ b/tests/testutil.h @@ -261,10 +261,13 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vectorupload_model(cmd, opt); + ncnn::Option opt_upload = opt; + opt_upload.blob_vkallocator = &g_weight_vkallocator; + opt_upload.workspace_vkallocator = &g_weight_vkallocator; + opt_upload.staging_vkallocator = &g_weight_staging_vkallocator; + + op->upload_model(cmd, opt_upload); cmd.submit_and_wait(); } @@ -367,57 +370,35 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector a4_fp16_gpu(a4_fp16.size()); - for (size_t i=0; i a4_fp16_gpu(a4_fp16.size()); for (size_t i=0; i d4_fp16_gpu(top_blob_count); if (op->support_inplace) { - for (size_t i=0; iforward_inplace(a4_fp16_gpu, cmd, opt); - op->forward_inplace(d4_fp16_gpu, cmd, opt); + d4_fp16_gpu = a4_fp16_gpu; } else { op->forward(a4_fp16_gpu, d4_fp16_gpu, cmd, opt); } + // download for (size_t i=0; iupload_model(cmd, opt); + ncnn::Option opt_upload = opt; + opt_upload.blob_vkallocator = &g_weight_vkallocator; + opt_upload.workspace_vkallocator = &g_weight_vkallocator; + opt_upload.staging_vkallocator = &g_weight_staging_vkallocator; - cmd.submit_and_wait(); + op->upload_model(cmd, opt_upload); - g_weight_staging_vkallocator.clear(); + cmd.submit_and_wait(); } #endif // NCNN_VULKAN @@ -594,38 +576,29 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vectorsupport_inplace) { - d4_fp16_gpu.create_like(a4_fp16_gpu, a4_fp16_gpu.allocator, a4_fp16_gpu.staging_allocator); - cmd.record_clone(a4_fp16_gpu, d4_fp16_gpu); - op->forward_inplace(d4_fp16_gpu, cmd, opt); + op->forward_inplace(a4_fp16_gpu, cmd, opt); + + d4_fp16_gpu = a4_fp16_gpu; } else { op->forward(a4_fp16_gpu, d4_fp16_gpu, cmd, opt); } - d4_fp16_gpu.prepare_staging_buffer(); - - cmd.record_download(d4_fp16_gpu); + // download + cmd.record_download(d4_fp16_gpu, d, opt); cmd.submit_and_wait(); - - // download - d.create_like(d4_fp16_gpu); - d4_fp16_gpu.download(d); } #endif // NCNN_VULKAN