* vkmat and command api breaks * always use compute queue for compute buffer transfer * no barrier for readonly weight buffer * record clone, drop queue_owner * bring back layer forward * fix validation errors * lifecycle inside command makes life easier * update doc * record_import_android_hardware_buffer (tags/20200413)
| @@ -150,6 +150,17 @@ ncnn::VkWeightStagingBufferAllocator g_weight_staging_vkallocator(vkdev); | |||
| ncnn::Layer* convolution = ncnn::create_layer("Convolution"); | |||
| convolution->vkdev = vkdev; | |||
| // set option | |||
| ncnn::Option opt; | |||
| opt.lightmode = true; | |||
| opt.num_threads = 4; | |||
| opt.blob_allocator = 0; | |||
| opt.workspace_allocator = 0; | |||
| opt.vulkan_compute = true; | |||
| opt.blob_vkallocator = &g_blob_vkallocator; | |||
| opt.workspace_vkallocator = &g_blob_vkallocator; | |||
| opt.staging_vkallocator = &g_staging_vkallocator; | |||
| // load param | |||
| { | |||
| ncnn::ParamDict pd; | |||
| @@ -171,76 +182,42 @@ ncnn::ModelBinFromMatArray mb(weights); | |||
| convolution->load_model(mb); | |||
| } | |||
| // upload model | |||
| { | |||
| ncnn::VkTransfer cmd(vkdev); | |||
| cmd.weight_vkallocator = &g_weight_vkallocator; | |||
| cmd.staging_vkallocator = &g_weight_staging_vkallocator; | |||
| convolution->upload_model(cmd); | |||
| cmd.submit(); | |||
| cmd.wait(); | |||
| g_weight_staging_vkallocator.clear(); | |||
| } | |||
| // create pipeline | |||
| convolution->create_pipeline(opt); | |||
| // set default option | |||
| // upload model | |||
| { | |||
| ncnn::Option opt = ncnn::get_default_option(); | |||
| ncnn::VkTransfer cmd(vkdev); | |||
| opt.lightmode = true; | |||
| opt.num_threads = 4; | |||
| opt.blob_allocator = 0; | |||
| opt.workspace_allocator = 0; | |||
| ncnn::Option opt_upload = opt; | |||
| opt_upload.blob_vkallocator = &g_weight_vkallocator; | |||
| opt_upload.workspace_vkallocator = &g_weight_vkallocator; | |||
| opt_upload.staging_vkallocator = &g_weight_staging_vkallocator; | |||
| opt.vulkan_compute = true; | |||
| opt.blob_vkallocator = &g_blob_vkallocator; | |||
| opt.workspace_vkallocator = &g_blob_vkallocator; | |||
| opt.staging_vkallocator = &g_staging_vkallocator; | |||
| convolution->upload_model(cmd, opt_upload); | |||
| ncnn::set_default_option(opt); | |||
| cmd.submit_and_wait(); | |||
| } | |||
| ncnn::Mat bottom = random_mat(w, h, inch); | |||
| ncnn::VkMat bottom_gpu; | |||
| // copy bottom to bottom_gpu | |||
| { | |||
| bottom_gpu.create_like(bottom, &g_blob_vkallocator, &g_staging_vkallocator); | |||
| bottom_gpu.prepare_staging_buffer(); | |||
| bottom_gpu.upload(bottom); | |||
| } | |||
| ncnn::VkMat top_gpu; | |||
| ncnn::Mat top; | |||
| // forward | |||
| { | |||
| ncnn::VkCompute cmd(vkdev); | |||
| cmd.record_upload(bottom_gpu); | |||
| ncnn::VkMat bottom_gpu; | |||
| cmd.record_upload(bottom, bottom_gpu, opt); | |||
| ncnn::VkMat top_gpu; | |||
| convolution->forward(bottom_gpu, top_gpu, cmd, opt); | |||
| top_gpu.prepare_staging_buffer(); | |||
| cmd.record_download(top_gpu); | |||
| cmd.record_download(top_gpu, top, opt); | |||
| cmd.submit_and_wait(); | |||
| } | |||
| ncnn::Mat top; | |||
| // copy top_gpu to top | |||
| { | |||
| top.create_like(top_gpu); | |||
| top_gpu.download(top); | |||
| } | |||
| convolution->destroy_pipeline(opt); | |||
| delete convolution; | |||
| @@ -470,7 +470,8 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size) | |||
| ptr->memory = buffer_blocks[i]->memory; | |||
| ptr->capacity = aligned_size; | |||
| ptr->mapped_ptr = buffer_blocks[i]->mapped_ptr; | |||
| ptr->state = 1; | |||
| ptr->access_flags = 0; | |||
| ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; | |||
| // adjust budgets | |||
| if (budget_size == aligned_size) | |||
| @@ -540,7 +541,8 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size) | |||
| ptr->memory = block->memory; | |||
| ptr->capacity = aligned_size; | |||
| ptr->mapped_ptr = block->mapped_ptr; | |||
| ptr->state = 1; | |||
| ptr->access_flags = 0; | |||
| ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; | |||
| // adjust budgets | |||
| std::list< std::pair<size_t, size_t> > budget; | |||
| @@ -715,7 +717,8 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size) | |||
| ptr->memory = buffer_blocks[block_index]->memory; | |||
| ptr->capacity = aligned_size; | |||
| ptr->mapped_ptr = buffer_blocks[block_index]->mapped_ptr; | |||
| ptr->state = 1; | |||
| ptr->access_flags = 0; | |||
| ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; | |||
| buffer_block_free_spaces[block_index] -= aligned_size; | |||
| @@ -790,7 +793,8 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size) | |||
| ptr->memory = block->memory; | |||
| ptr->capacity = new_block_size; | |||
| ptr->mapped_ptr = block->mapped_ptr; | |||
| ptr->state = 1; | |||
| ptr->access_flags = 0; | |||
| ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; | |||
| return ptr; | |||
| } | |||
| @@ -841,7 +845,8 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size) | |||
| ptr->memory = block->memory; | |||
| ptr->capacity = aligned_size; | |||
| ptr->mapped_ptr = block->mapped_ptr; | |||
| ptr->state = 1; | |||
| ptr->access_flags = 0; | |||
| ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; | |||
| return ptr; | |||
| } | |||
| @@ -940,7 +945,8 @@ VkBufferMemory* VkStagingBufferAllocator::fastMalloc(size_t size) | |||
| vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr); | |||
| ptr->state = 1; | |||
| ptr->access_flags = 0; | |||
| ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; | |||
| // fprintf(stderr, "VkStagingBufferAllocator M %p %lu\n", ptr->buffer, size); | |||
| @@ -989,7 +995,8 @@ VkBufferMemory* VkWeightStagingBufferAllocator::fastMalloc(size_t size) | |||
| vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr); | |||
| ptr->state = 1; | |||
| ptr->access_flags = 0; | |||
| ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; | |||
| // fprintf(stderr, "VkWeightStagingBufferAllocator M %p %lu\n", ptr->buffer, size); | |||
| @@ -1137,7 +1144,8 @@ VkImageMemory* VkSimpleImageAllocator::fastMalloc(int width, int height, VkForma | |||
| ptr->imageview = create_imageview(ptr->image, format); | |||
| ptr->state = 1; | |||
| ptr->access_flags = 0; | |||
| ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; | |||
| return ptr; | |||
| } | |||
| @@ -1290,7 +1298,8 @@ VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*width*/, | |||
| ptr->image = image; | |||
| ptr->memory = memory; | |||
| ptr->imageview = imageview; | |||
| ptr->state = 1; | |||
| ptr->access_flags = 0; | |||
| ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; | |||
| return ptr; | |||
| } | |||
| @@ -192,12 +192,8 @@ public: | |||
| void* mapped_ptr; | |||
| // buffer state, modified by command functions internally | |||
| // 0=null | |||
| // 1=created | |||
| // 2=transfer | |||
| // 3=compute | |||
| // 4=readonly | |||
| mutable int state; | |||
| mutable VkAccessFlags access_flags; | |||
| mutable VkPipelineStageFlags stage_flags; | |||
| // initialize and modified by mat | |||
| int refcount; | |||
| @@ -311,13 +307,9 @@ public: | |||
| VkDeviceMemory memory; | |||
| // buffer state, modified by command functions internally | |||
| // 0=null | |||
| // 1=created | |||
| // 2=transfer | |||
| // 3=compute | |||
| // 4=readonly | |||
| mutable int state; | |||
| // image state, modified by command functions internally | |||
| mutable VkAccessFlags access_flags; | |||
| mutable VkPipelineStageFlags stage_flags; | |||
| // initialize and modified by mat | |||
| int refcount; | |||
| @@ -1,6 +1,6 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| @@ -22,61 +22,31 @@ | |||
| #include <vector> | |||
| #include <vulkan/vulkan.h> | |||
| #include "mat.h" | |||
| #include "pipeline.h" | |||
| namespace ncnn { | |||
| class Command | |||
| { | |||
| public: | |||
| Command(const VulkanDevice* vkdev, uint32_t queue_family_index); | |||
| virtual ~Command(); | |||
| protected: | |||
| int create_command_pool(); | |||
| int create_command_buffer(); | |||
| // record issue | |||
| int begin_command_buffer(); | |||
| int end_command_buffer(); | |||
| int queue_submit_and_wait_fence(); | |||
| protected: | |||
| const VulkanDevice* vkdev; | |||
| uint32_t queue_family_index; | |||
| VkCommandPool command_pool; | |||
| VkCommandBuffer command_buffer; | |||
| VkFence fence; | |||
| }; | |||
| class VkCompute : public Command | |||
| class Pipeline; | |||
| class VkCompute | |||
| { | |||
| public: | |||
| VkCompute(const VulkanDevice* vkdev); | |||
| ~VkCompute(); | |||
| void record_upload(const VkMat& m); | |||
| void record_download(const VkMat& m); | |||
| virtual ~VkCompute(); | |||
| void record_clone(const VkMat& src, const VkMat& dst); | |||
| public: | |||
| void record_upload(const Mat& src, VkMat& dst, const Option& opt); | |||
| void record_copy_region(const VkMat& src, const VkMat& dst, const VkBufferCopy& region); | |||
| void record_download(const VkMat& src, Mat& dst, const Option& opt); | |||
| void record_copy_regions(const VkMat& src, const VkMat& dst, const std::vector<VkBufferCopy>& regions); | |||
| void record_clone(const VkMat& src, VkMat& dst, const Option& opt); | |||
| void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& m); | |||
| void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher); | |||
| #if NCNN_BENCHMARK | |||
| void record_write_timestamp(uint32_t query); | |||
| #endif // NCNN_BENCHMARK | |||
| void record_queue_transfer_acquire(const VkMat& m, uint32_t src_queue_family_index); | |||
| #if __ANDROID_API__ >= 26 | |||
| void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& im, const VkMat& m); | |||
| void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst); | |||
| #endif // __ANDROID_API__ >= 26 | |||
| int submit_and_wait(); | |||
| @@ -90,116 +60,73 @@ public: | |||
| #endif // NCNN_BENCHMARK | |||
| protected: | |||
| // record pipeline things | |||
| void record_bind_pipeline(VkPipeline pipeline); | |||
| void record_update_bindings(VkPipelineLayout pipeline_layout, VkDescriptorSetLayout descriptorset_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const std::vector<VkMat>& bindings); | |||
| void record_push_constants(VkPipelineLayout pipeline_layout, const std::vector<vk_constant_type>& constants); | |||
| void record_dispatch(const uint32_t* group_count_xyz); | |||
| // record barrier things | |||
| void record_transfer_compute_barrier(const VkMat& m); | |||
| void record_compute_transfer_barrier(const VkMat& m); | |||
| void record_compute_compute_barrier(const VkMat& m); | |||
| void record_transfer_transfer_barrier(const VkMat& m); | |||
| void record_host_transfer_barrier(const VkMat& m); | |||
| void record_transfer_host_barrier(const VkMat& m); | |||
| void record_host_compute_barrier(const VkMat& m); | |||
| void record_compute_host_barrier(const VkMat& m); | |||
| // record prepare things | |||
| void record_prepare_transfer_barrier(const VkMat& m); | |||
| void record_prepare_compute_barrier(const VkMat& m); | |||
| void record_prepare_host_barrier(const VkMat& m); | |||
| void record_initial_image_compute_barrier(const VkImageMat& im); | |||
| int init(); | |||
| int begin_command_buffer(); | |||
| int end_command_buffer(); | |||
| #if __ANDROID_API__ >= 26 | |||
| void record_update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorSetLayout descriptorset_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, VkSampler sampler, const VkImageMat& im, const VkMat& m); | |||
| #endif // __ANDROID_API__ >= 26 | |||
| protected: | |||
| const VulkanDevice* vkdev; | |||
| #if NCNN_BENCHMARK | |||
| void reset_query_pool(); | |||
| #endif // NCNN_BENCHMARK | |||
| VkCommandPool compute_command_pool; | |||
| protected: | |||
| // recording issue | |||
| void copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size); | |||
| void copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector<VkBufferCopy>& regions); | |||
| void bind_pipeline(VkPipeline pipeline); | |||
| void bind_descriptorset(VkPipelineLayout pipeline_layout, VkDescriptorSet descriptorset); | |||
| void update_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const std::vector<VkDescriptorBufferInfo>& descriptorBufferInfos); | |||
| void push_constants(VkPipelineLayout pipeline_layout, const std::vector<vk_constant_type>& constants); | |||
| void dispatch(const uint32_t* group_count_xyz); | |||
| void transfer_compute_barrier(VkBuffer buffer, size_t offset, size_t size); | |||
| void compute_transfer_barrier(VkBuffer buffer, size_t offset, size_t size); | |||
| void compute_compute_barrier(VkBuffer buffer, size_t offset, size_t size); | |||
| void transfer_transfer_barrier(VkBuffer buffer, size_t offset, size_t size); | |||
| void host_transfer_barrier(VkBuffer buffer, size_t offset, size_t size); | |||
| void transfer_host_barrier(VkBuffer buffer, size_t offset, size_t size); | |||
| void host_compute_barrier(VkBuffer buffer, size_t offset, size_t size); | |||
| void compute_host_barrier(VkBuffer buffer, size_t offset, size_t size); | |||
| void queue_transfer_acquire_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t src_queue_family_index); | |||
| void initial_image_compute_barrier(VkImage image); | |||
| #if __ANDROID_API__ >= 26 | |||
| void update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const VkDescriptorImageInfo& descriptorImageInfo, const VkDescriptorBufferInfo& descriptorBufferInfo); | |||
| #endif // __ANDROID_API__ >= 26 | |||
| #if NCNN_BENCHMARK | |||
| void write_timestamp(uint32_t query); | |||
| #endif // NCNN_BENCHMARK | |||
| VkCommandBuffer compute_command_buffer; | |||
| VkFence compute_command_fence; | |||
| std::vector<VkMat> upload_staging_buffers; | |||
| std::vector<VkMat> download_post_buffers; | |||
| std::vector<Mat> download_post_mats; | |||
| protected: | |||
| // delayed record | |||
| // the good-old path for device without VK_KHR_push_descriptor | |||
| std::vector<VkDescriptorPool> descriptor_pools; | |||
| std::vector<VkDescriptorSet> descriptorsets; | |||
| struct record_type | |||
| struct record | |||
| { | |||
| // 0=copy | |||
| // 1=copy regions | |||
| // 2=bind pipeline | |||
| // 3=bind descriptorset | |||
| // 4=push constants | |||
| // 5=dispatch | |||
| // 6=transfer-compute barrier | |||
| // 7=compute-transfer barrier | |||
| // 8=compute-compute barrier | |||
| // 9=transfer-transfer barrier | |||
| // 10=write timestamp | |||
| // 11=initial image compute barrier | |||
| // 12=host-transfer barrier | |||
| // 13=transfer-host barrier | |||
| // 14=host-compute barrier | |||
| // 15=compute-host barrier | |||
| // 16=queue-transfer-acquire barrier | |||
| enum | |||
| { | |||
| TYPE_copy_buffer, | |||
| TYPE_bind_pipeline, | |||
| TYPE_bind_descriptorsets, | |||
| TYPE_push_constants, | |||
| TYPE_dispatch, | |||
| TYPE_memory_barrers, | |||
| TYPE_buffer_barrers, | |||
| TYPE_image_barrers, | |||
| #if NCNN_BENCHMARK | |||
| TYPE_write_timestamp, | |||
| #endif // NCNN_BENCHMARK | |||
| TYPE_post_download, | |||
| }; | |||
| int type; | |||
| VkCommandBuffer command_buffer; | |||
| union | |||
| { | |||
| struct { VkBuffer src; size_t src_offset; VkBuffer dst; size_t dst_offset; size_t size; } copy; | |||
| struct { VkBuffer src; VkBuffer dst; } copy_regions; | |||
| struct { VkPipeline pipeline; } bind_pipeline; | |||
| struct { VkPipelineLayout pipeline_layout; VkDescriptorSet descriptorset; } bind_descriptorset; | |||
| struct { VkPipelineLayout pipeline_layout; } push_constants; | |||
| struct { uint32_t group_count_xyz[3]; } dispatch; | |||
| struct { VkBuffer buffer; size_t offset; size_t size; } transfer_compute_barrier; | |||
| struct { VkBuffer buffer; size_t offset; size_t size; } compute_transfer_barrier; | |||
| struct { VkBuffer buffer; size_t offset; size_t size; } compute_compute_barrier; | |||
| struct { VkBuffer buffer; size_t offset; size_t size; } transfer_transfer_barrier; | |||
| struct { VkBuffer src; VkBuffer dst; uint32_t region_count; const VkBufferCopy* regions; } copy_buffer; | |||
| struct { VkPipelineBindPoint bind_point; VkPipeline pipeline; } bind_pipeline; | |||
| struct { VkPipelineBindPoint bind_point; VkPipelineLayout pipeline_layout; uint32_t descriptorset_count; uint32_t descriptorset_offset; } bind_descriptorsets; | |||
| struct { VkPipelineLayout pipeline_layout; VkShaderStageFlags stage_flags; uint32_t size; const void* values; } push_constants; | |||
| struct { uint32_t group_count_x; uint32_t group_count_y; uint32_t group_count_z; } dispatch; | |||
| struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkMemoryBarrier* barriers; } memory_barrers; | |||
| struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkBufferMemoryBarrier* barriers; } buffer_barrers; | |||
| struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkImageMemoryBarrier* barriers; } image_barrers; | |||
| #if NCNN_BENCHMARK | |||
| struct { uint32_t query; } write_timestamp; | |||
| #endif // NCNN_BENCHMARK | |||
| struct { VkImage image; } initial_image_compute_barrier; | |||
| struct { VkBuffer buffer; size_t offset; size_t size; } host_transfer_barrier; | |||
| struct { VkBuffer buffer; size_t offset; size_t size; } transfer_host_barrier; | |||
| struct { VkBuffer buffer; size_t offset; size_t size; } host_compute_barrier; | |||
| struct { VkBuffer buffer; size_t offset; size_t size; } compute_host_barrier; | |||
| struct { VkBuffer buffer; size_t offset; size_t size; size_t src_queue_family_index; } queue_transfer_acquire_barrier; | |||
| }; | |||
| std::vector<VkBufferCopy> regions; | |||
| std::vector<vk_constant_type> constants; | |||
| struct { uint32_t download_post_buffer_mat_offset; } post_download; | |||
| }; | |||
| }; | |||
| std::vector<record_type> delayed_records; | |||
| std::vector<record> delayed_records; | |||
| #if NCNN_BENCHMARK | |||
| uint32_t query_count; | |||
| @@ -207,38 +134,37 @@ protected: | |||
| #endif // NCNN_BENCHMARK | |||
| }; | |||
| class VkTransfer : public Command | |||
| class VkTransfer | |||
| { | |||
| public: | |||
| VkTransfer(const VulkanDevice* vkdev); | |||
| ~VkTransfer(); | |||
| public: | |||
| void record_upload(const Mat& src, VkMat& dst, const Option& opt); | |||
| int submit_and_wait(); | |||
| public: | |||
| VkAllocator* weight_vkallocator; | |||
| VkAllocator* staging_vkallocator; | |||
| protected: | |||
| // recording issue | |||
| void copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size); | |||
| void copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector<VkBufferCopy>& regions); | |||
| void queue_transfer_release_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t target_queue_family_index); | |||
| int init(); | |||
| int begin_command_buffer(); | |||
| int end_command_buffer(); | |||
| protected: | |||
| size_t buffer_offset_alignment; | |||
| VkBufferMemory* staging_data; | |||
| const VulkanDevice* vkdev; | |||
| // delayed record | |||
| struct record_type | |||
| { | |||
| size_t size; | |||
| Mat mat; | |||
| VkMat vkmat; | |||
| }; | |||
| std::vector<record_type> delayed_records; | |||
| VkCommandPool compute_command_pool; | |||
| VkCommandPool transfer_command_pool; | |||
| VkCommandBuffer upload_command_buffer; | |||
| VkCommandBuffer compute_command_buffer; | |||
| VkSemaphore upload_compute_semaphore; | |||
| VkFence upload_command_fence; | |||
| VkFence compute_command_fence; | |||
| std::vector<VkMat> upload_staging_buffers; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -628,6 +628,8 @@ int create_gpu_instance() | |||
| gpu_info.graphics_queue_count = queueFamilyProperties[gpu_info.graphics_queue_family_index].queueCount; | |||
| gpu_info.transfer_queue_count = queueFamilyProperties[gpu_info.transfer_queue_family_index].queueCount; | |||
| gpu_info.unified_compute_transfer_queue = gpu_info.compute_queue_family_index == gpu_info.transfer_queue_family_index; | |||
| // cache memory properties | |||
| vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.physicalDeviceMemoryProperties); | |||
| @@ -111,6 +111,9 @@ public: | |||
| uint32_t graphics_queue_count; | |||
| uint32_t transfer_queue_count; | |||
| // property | |||
| bool unified_compute_transfer_queue; | |||
| // bug is not feature | |||
| bool bug_local_size_spec_const; | |||
| @@ -121,11 +121,7 @@ int Layer::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& t | |||
| top_blobs.resize(bottom_blobs.size()); | |||
| for (int i = 0; i < (int)top_blobs.size(); i++) | |||
| { | |||
| top_blobs[i].create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator); | |||
| if (top_blobs[i].empty()) | |||
| return -100; | |||
| cmd.record_clone(bottom_blobs[i], top_blobs[i]); | |||
| cmd.record_clone(bottom_blobs[i], top_blobs[i], opt); | |||
| } | |||
| return forward_inplace(top_blobs, cmd, opt); | |||
| @@ -136,11 +132,7 @@ int Layer::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, co | |||
| if (!support_inplace) | |||
| return -1; | |||
| top_blob.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| cmd.record_clone(bottom_blob, top_blob); | |||
| cmd.record_clone(bottom_blob, top_blob, opt); | |||
| return forward_inplace(top_blob, cmd, opt); | |||
| } | |||
| @@ -317,21 +317,21 @@ int BinaryOp_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector | |||
| // broadcast | |||
| if (bottom_blob.dims > bottom_blob1.dims) | |||
| { | |||
| top_blob.create_like(bottom_blob, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create_like(bottom_blob, opt.blob_vkallocator); | |||
| } | |||
| else if (bottom_blob.dims < bottom_blob1.dims) | |||
| { | |||
| top_blob.create_like(bottom_blob1, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create_like(bottom_blob1, opt.blob_vkallocator); | |||
| } | |||
| else // if (bottom_blob.dims == bottom_blob1.dims) | |||
| { | |||
| if (bottom_blob.w * bottom_blob.h * bottom_blob.c * bottom_blob.elempack >= bottom_blob1.w * bottom_blob1.h * bottom_blob1.c * bottom_blob1.elempack) | |||
| { | |||
| top_blob.create_like(bottom_blob, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create_like(bottom_blob, opt.blob_vkallocator); | |||
| } | |||
| else | |||
| { | |||
| top_blob.create_like(bottom_blob1, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create_like(bottom_blob1, opt.blob_vkallocator); | |||
| } | |||
| } | |||
| if (top_blob.empty()) | |||
| @@ -234,15 +234,15 @@ int Cast_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c | |||
| if (dims == 1) | |||
| { | |||
| top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, out_elemsize, elempack, opt.blob_vkallocator); | |||
| } | |||
| else if (dims == 2) | |||
| { | |||
| top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, h, out_elemsize, elempack, opt.blob_vkallocator); | |||
| } | |||
| else if (dims == 3) | |||
| { | |||
| top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_vkallocator); | |||
| } | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -312,14 +312,14 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V | |||
| } | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| VkMat top_blob_unpacked = top_blob; | |||
| if (elempack < out_elempack) | |||
| { | |||
| top_blob_unpacked.create(top_w / elempack, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| top_blob_unpacked.create(top_w / elempack, elemsize, elempack, opt.workspace_vkallocator); | |||
| if (top_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| @@ -415,14 +415,14 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V | |||
| } | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| VkMat top_blob_unpacked = top_blob; | |||
| if (elempack < out_elempack) | |||
| { | |||
| top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_vkallocator); | |||
| if (top_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| @@ -506,7 +506,7 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V | |||
| } | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(top_w, h, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(top_w, h, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -573,14 +573,14 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V | |||
| } | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| VkMat top_blob_unpacked = top_blob; | |||
| if (elempack < out_elempack) | |||
| { | |||
| top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_vkallocator); | |||
| if (top_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| @@ -665,7 +665,7 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V | |||
| } | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -720,7 +720,7 @@ int Concat_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<V | |||
| } | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -1010,8 +1010,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| padding_param_blob.prepare_staging_buffer(); | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = hpad / 2; | |||
| @@ -1037,8 +1036,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| padding_param_blob.prepare_staging_buffer(); | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = hpad - hpad / 2; | |||
| @@ -1089,8 +1087,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| padding_param_blob.prepare_staging_buffer(); | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = 0; | |||
| @@ -1110,7 +1107,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| // transform input | |||
| VkMat bottom_tm_blob; | |||
| { | |||
| bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator); | |||
| if (bottom_tm_blob.empty()) | |||
| return -100; | |||
| @@ -1138,7 +1135,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| // gemm | |||
| VkMat top_tm_blob; | |||
| { | |||
| top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); | |||
| if (top_tm_blob.empty()) | |||
| return -100; | |||
| @@ -1165,7 +1162,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| // transform output | |||
| VkMat top_blob_bordered; | |||
| { | |||
| top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| @@ -1193,8 +1190,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| // crop top_blob | |||
| { | |||
| VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| crop_param_blob.prepare_staging_buffer(); | |||
| VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* crop_params = crop_param_blob.mapped(); | |||
| crop_params[0] = 0; | |||
| @@ -1232,8 +1228,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| padding_param_blob.prepare_staging_buffer(); | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = 0; | |||
| @@ -1253,7 +1248,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| // transform input | |||
| VkMat bottom_tm_blob; | |||
| { | |||
| bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator); | |||
| if (bottom_tm_blob.empty()) | |||
| return -100; | |||
| @@ -1281,7 +1276,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| // gemm | |||
| VkMat top_tm_blob; | |||
| { | |||
| top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); | |||
| if (top_tm_blob.empty()) | |||
| return -100; | |||
| @@ -1308,7 +1303,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| // transform output | |||
| VkMat top_blob_bordered; | |||
| { | |||
| top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| @@ -1336,8 +1331,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| // crop top_blob | |||
| { | |||
| VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| crop_param_blob.prepare_staging_buffer(); | |||
| VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* crop_params = crop_param_blob.mapped(); | |||
| crop_params[0] = 0; | |||
| @@ -1360,7 +1354,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| } | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -534,8 +534,7 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| padding_param_blob.prepare_staging_buffer(); | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = hpad / 2; | |||
| @@ -561,8 +560,7 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| padding_param_blob.prepare_staging_buffer(); | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = hpad - hpad / 2; | |||
| @@ -595,7 +593,7 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -656,7 +654,7 @@ int ConvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_bl | |||
| VkMat top_blob_unpacked = top_blob; | |||
| if (out_elempack_g < out_elempack) | |||
| { | |||
| top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator); | |||
| if (top_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| @@ -381,7 +381,7 @@ int Crop_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& c | |||
| packing->forward(bottom_blob, bottom_blob_unpacked, cmd, opt_pack1); | |||
| } | |||
| top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -515,7 +515,7 @@ int Crop_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkM | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -414,11 +414,11 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC | |||
| VkMat top_blob_bordered; | |||
| if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0)) | |||
| { | |||
| top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); | |||
| } | |||
| else | |||
| { | |||
| top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| } | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| @@ -528,8 +528,7 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC | |||
| int wcut = top_blob_bordered_adj.w - output_w; | |||
| int hcut = top_blob_bordered_adj.h - output_h; | |||
| VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| crop_param_blob.prepare_staging_buffer(); | |||
| VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* crop_params = crop_param_blob.mapped(); | |||
| if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233) | |||
| @@ -597,11 +597,11 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_ | |||
| VkMat top_blob_bordered; | |||
| if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || output_pad_right > 0 || output_pad_bottom > 0 || (output_w > 0 && output_h > 0)) | |||
| { | |||
| top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); | |||
| } | |||
| else | |||
| { | |||
| top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| } | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| @@ -681,8 +681,7 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_ | |||
| int wcut = top_blob_bordered_adj.w - output_w; | |||
| int hcut = top_blob_bordered_adj.h - output_h; | |||
| VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| crop_param_blob.prepare_staging_buffer(); | |||
| VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* crop_params = crop_param_blob.mapped(); | |||
| if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233) | |||
| @@ -763,7 +762,7 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_ | |||
| VkMat top_blob_unpacked = top_blob_bordered; | |||
| if (out_elempack_g < out_elempack) | |||
| { | |||
| top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| top_blob_unpacked.create(outw, outh, num_output / out_elempack_g, out_elemsize_g, out_elempack_g, opt.workspace_vkallocator); | |||
| if (top_blob_unpacked.empty()) | |||
| return -100; | |||
| } | |||
| @@ -883,8 +882,7 @@ int DeconvolutionDepthWise_vulkan::forward(const VkMat& bottom_blob, VkMat& top_ | |||
| int wcut = top_blob_bordered_adj.w - output_w; | |||
| int hcut = top_blob_bordered_adj.h - output_h; | |||
| VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| crop_param_blob.prepare_staging_buffer(); | |||
| VkMat crop_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* crop_params = crop_param_blob.mapped(); | |||
| if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233) | |||
| @@ -144,7 +144,7 @@ int DeepCopy_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkComput | |||
| { | |||
| int elempack = bottom_blob.elempack; | |||
| top_blob.create_like(bottom_blob, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create_like(bottom_blob, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -157,7 +157,7 @@ int Eltwise_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector< | |||
| int elempack = bottom_blob.elempack; | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -205,7 +205,7 @@ int Flatten_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| return 0; | |||
| } | |||
| top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -306,7 +306,7 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -380,7 +380,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, | |||
| int elempack = bottom_top_blob.elempack; | |||
| // mean | |||
| VkMat mean_workspace(c, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| VkMat mean_workspace(c, elemsize, elempack, opt.workspace_vkallocator); | |||
| { | |||
| // reduce sum | |||
| VkMat sum_workspace; | |||
| @@ -389,7 +389,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, | |||
| int reduced_h = 1; | |||
| int reduced_c = bottom_top_blob.c; | |||
| sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator); | |||
| { | |||
| std::vector<VkMat> bindings(2); | |||
| bindings[0] = bottom_top_blob; | |||
| @@ -419,7 +419,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, | |||
| int reduced_c = sum_workspace.c; | |||
| VkMat sum_workspace_reduced; | |||
| sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator); | |||
| { | |||
| std::vector<VkMat> bindings(2); | |||
| @@ -466,11 +466,11 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, | |||
| } | |||
| // var | |||
| VkMat var_workspace(c, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| VkMat var_workspace(c, elemsize, elempack, opt.workspace_vkallocator); | |||
| { | |||
| // sub mean and square | |||
| VkMat square_workspace; | |||
| square_workspace.create(w, h, c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| square_workspace.create(w, h, c, 4u*elempack, elempack, opt.workspace_vkallocator); | |||
| { | |||
| std::vector<VkMat> bindings(3); | |||
| bindings[0] = bottom_top_blob; | |||
| @@ -509,7 +509,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, | |||
| int reduced_c = sqsum_workspace.c; | |||
| VkMat sqsum_workspace_reduced; | |||
| sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator); | |||
| { | |||
| std::vector<VkMat> bindings(2); | |||
| @@ -557,7 +557,7 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, | |||
| // coeffs | |||
| VkMat coeffs_workspace; | |||
| coeffs_workspace.create(c, elemsize * 2, elempack * 2, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| coeffs_workspace.create(c, elemsize * 2, elempack * 2, opt.workspace_vkallocator); | |||
| { | |||
| std::vector<VkMat> bindings(5); | |||
| bindings[0] = coeffs_workspace; | |||
| @@ -274,7 +274,7 @@ int Interp_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& | |||
| return 0; | |||
| } | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -306,11 +306,11 @@ int Interp_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& | |||
| } | |||
| else if (resize_type == 3) // bicubic | |||
| { | |||
| VkMat alpha(outw, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| VkMat alpha(outw, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator); | |||
| if (alpha.empty()) | |||
| return -100; | |||
| VkMat xofs(outw, (size_t)4u, 1, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| VkMat xofs(outw, (size_t)4u, 1, opt.workspace_vkallocator); | |||
| if (xofs.empty()) | |||
| return -100; | |||
| @@ -328,11 +328,11 @@ int Interp_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& | |||
| cmd.record_pipeline(pipeline_interp_bicubic_coeffs_x, bindings, constants, alpha); | |||
| } | |||
| VkMat beta(outh, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| VkMat beta(outh, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator); | |||
| if (beta.empty()) | |||
| return -100; | |||
| VkMat yofs(outh, (size_t)4u, 1, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| VkMat yofs(outh, (size_t)4u, 1, opt.workspace_vkallocator); | |||
| if (yofs.empty()) | |||
| return -100; | |||
| @@ -254,11 +254,11 @@ int LRN_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Op | |||
| if (region_type == NormRegion_ACROSS_CHANNELS) | |||
| { | |||
| // always create scalar square workspace blob for norm across channel | |||
| square_workspace.create(w, h, channels * elempack + local_size - 1, 4u, 1, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| square_workspace.create(w, h, channels * elempack + local_size - 1, 4u, 1, opt.workspace_vkallocator); | |||
| } | |||
| else if (region_type == NormRegion_WITHIN_CHANNEL) | |||
| { | |||
| square_workspace.create(w + local_size - 1, h + local_size - 1, channels, elempack * 4u, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| square_workspace.create(w + local_size - 1, h + local_size - 1, channels, elempack * 4u, elempack, opt.workspace_vkallocator); | |||
| } | |||
| // square pad | |||
| @@ -298,7 +298,7 @@ int Normalize_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co | |||
| reduced_c = (bottom_top_blob.c + 3) / 4; | |||
| } | |||
| sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator); | |||
| { | |||
| std::vector<VkMat> bindings(2); | |||
| bindings[0] = bottom_top_blob; | |||
| @@ -347,7 +347,7 @@ int Normalize_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co | |||
| } | |||
| VkMat sqsum_workspace_reduced; | |||
| sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u*elempack, elempack, opt.workspace_vkallocator); | |||
| { | |||
| std::vector<VkMat> bindings(2); | |||
| @@ -377,7 +377,7 @@ int Normalize_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co | |||
| // coeffs | |||
| VkMat coeffs_workspace; | |||
| coeffs_workspace.create(sqsum_workspace.w * sqsum_workspace.h * sqsum_workspace.c, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| coeffs_workspace.create(sqsum_workspace.w * sqsum_workspace.h * sqsum_workspace.c, elemsize, elempack, opt.workspace_vkallocator); | |||
| { | |||
| std::vector<VkMat> bindings(2); | |||
| bindings[0] = sqsum_workspace; | |||
| @@ -203,7 +203,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| @@ -219,7 +219,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| @@ -235,7 +235,7 @@ int Packing_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| @@ -170,7 +170,7 @@ int Padding_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| int outw = w + left + right; | |||
| int outh = h + top + bottom; | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -239,7 +239,7 @@ int Padding_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector< | |||
| int outw = w + _left + _right; | |||
| int outh = h + _top + _bottom; | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -270,7 +270,7 @@ int Permute_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| @@ -329,7 +329,7 @@ int Permute_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| } | |||
| @@ -200,7 +200,7 @@ int PixelShuffle_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -287,7 +287,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| if (global_pooling) | |||
| { | |||
| top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -295,7 +295,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| bindings[0] = bottom_blob; | |||
| bindings[1] = top_blob; | |||
| std::vector<vk_constant_type> constants(12); | |||
| std::vector<vk_constant_type> constants(10); | |||
| constants[0].i = bottom_blob.dims; | |||
| constants[1].i = bottom_blob.w; | |||
| constants[2].i = bottom_blob.h; | |||
| @@ -306,8 +306,6 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| constants[7].i = top_blob.h; | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = top_blob.cstep; | |||
| constants[10].i = 0; | |||
| constants[11].i = 0; | |||
| const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_global_pack8 | |||
| : elempack == 4 ? pipeline_pooling_global_pack4 | |||
| @@ -336,8 +334,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| padding_param_blob.prepare_staging_buffer(); | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = pad_top; | |||
| @@ -369,8 +366,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| padding_param_blob.prepare_staging_buffer(); | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = hpad / 2; | |||
| @@ -396,8 +392,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| padding_param_blob.prepare_staging_buffer(); | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = hpad - hpad / 2; | |||
| @@ -421,7 +416,7 @@ int Pooling_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| int outw = (w - kernel_w) / stride_w + 1; | |||
| int outh = (h - kernel_h) / stride_h + 1; | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -163,7 +163,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector | |||
| } | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(4 * w * h * num_prior / elempack, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(4 * w * h * num_prior / elempack, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -217,7 +217,7 @@ int PriorBox_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector | |||
| } | |||
| VkMat& top_blob = top_blobs[0]; | |||
| top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -192,7 +192,7 @@ int Reorg_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& | |||
| if (out_elempack == 1) out_elemsize = 4u; | |||
| } | |||
| top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -275,7 +275,7 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| return 0; | |||
| } | |||
| top_blob.create(_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| } | |||
| else if (ndim == 2) | |||
| { | |||
| @@ -308,7 +308,7 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| return 0; | |||
| } | |||
| top_blob.create(_w, _h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(_w, _h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| } | |||
| else // if (ndim == 3) | |||
| { | |||
| @@ -348,7 +348,7 @@ int Reshape_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute | |||
| return 0; | |||
| } | |||
| top_blob.create(_w, _h, _c / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(_w, _h, _c / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| } | |||
| if (top_blob.empty()) | |||
| @@ -142,7 +142,7 @@ int ShuffleChannel_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, Vk | |||
| size_t elemsize = bottom_blob.elemsize; | |||
| int elempack = bottom_blob.elempack; | |||
| top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -314,7 +314,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk | |||
| } | |||
| VkMat& top_blob = top_blobs[i]; | |||
| top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -416,7 +416,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk | |||
| } | |||
| VkMat& top_blob = top_blobs[i]; | |||
| top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -508,7 +508,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk | |||
| } | |||
| VkMat& top_blob = top_blobs[i]; | |||
| top_blob.create(slice, h, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(slice, h, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -576,7 +576,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk | |||
| } | |||
| VkMat& top_blob = top_blobs[i]; | |||
| top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -669,7 +669,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk | |||
| } | |||
| VkMat& top_blob = top_blobs[i]; | |||
| top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -727,7 +727,7 @@ int Slice_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<Vk | |||
| } | |||
| VkMat& top_blob = top_blobs[i]; | |||
| top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| @@ -283,33 +283,33 @@ int Softmax_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, cons | |||
| if (dims == 1) // axis == 0 | |||
| { | |||
| max_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| max_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator); | |||
| sum_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator); | |||
| } | |||
| else if (dims == 2 && axis == 0) | |||
| { | |||
| max_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| max_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator); | |||
| sum_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator); | |||
| } | |||
| else if (dims == 2 && axis == 1) | |||
| { | |||
| max_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| max_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator); | |||
| sum_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator); | |||
| } | |||
| else if (dims == 3 && axis == 0) | |||
| { | |||
| max_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| max_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator); | |||
| sum_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator); | |||
| } | |||
| else if (dims == 3 && axis == 1) | |||
| { | |||
| max_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| max_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator); | |||
| sum_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator); | |||
| } | |||
| else if (dims == 3 && axis == 2) | |||
| { | |||
| max_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| sum_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| max_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator); | |||
| sum_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator); | |||
| } | |||
| // reduce max | |||
| @@ -261,59 +261,51 @@ public: | |||
| // empty | |||
| VkMat(); | |||
| // vec | |||
| VkMat(int w, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| VkMat(int w, size_t elemsize, VkAllocator* allocator); | |||
| // image | |||
| VkMat(int w, int h, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| VkMat(int w, int h, size_t elemsize, VkAllocator* allocator); | |||
| // dim | |||
| VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| VkMat(int w, int h, int c, size_t elemsize, VkAllocator* allocator); | |||
| // packed vec | |||
| VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| VkMat(int w, size_t elemsize, int elempack, VkAllocator* allocator); | |||
| // packed image | |||
| VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| VkMat(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); | |||
| // packed dim | |||
| VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| VkMat(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); | |||
| // copy | |||
| VkMat(const VkMat& m); | |||
| // external vec | |||
| VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| VkMat(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); | |||
| // external image | |||
| VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); | |||
| // external dim | |||
| VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); | |||
| // external packed vec | |||
| VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| VkMat(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); | |||
| // external packed image | |||
| VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| VkMat(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); | |||
| // external packed dim | |||
| VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| VkMat(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); | |||
| // release | |||
| ~VkMat(); | |||
| // assign | |||
| VkMat& operator=(const VkMat& m); | |||
| // allocate vec | |||
| void create(int w, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| void create(int w, size_t elemsize, VkAllocator* allocator); | |||
| // allocate image | |||
| void create(int w, int h, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| void create(int w, int h, size_t elemsize, VkAllocator* allocator); | |||
| // allocate dim | |||
| void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator); | |||
| // allocate packed vec | |||
| void create(int w, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| void create(int w, size_t elemsize, int elempack, VkAllocator* allocator); | |||
| // allocate packed image | |||
| void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); | |||
| // allocate packed dim | |||
| void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); | |||
| // allocate like | |||
| void create_like(const Mat& m, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| void create_like(const Mat& m, VkAllocator* allocator); | |||
| // allocate like | |||
| void create_like(const VkMat& m, VkAllocator* allocator, VkAllocator* staging_allocator); | |||
| // staging buffer | |||
| void prepare_staging_buffer(); | |||
| void discard_staging_buffer(); | |||
| // copy | |||
| void upload(const Mat& m); | |||
| void download(Mat& m) const; | |||
| void create_like(const VkMat& m, VkAllocator* allocator); | |||
| // mapped | |||
| Mat mapped() const; | |||
| @@ -333,19 +325,14 @@ public: | |||
| // low-level reference | |||
| VkBuffer buffer() const; | |||
| size_t buffer_offset() const; | |||
| VkBuffer staging_buffer() const; | |||
| size_t staging_buffer_offset() const; | |||
| size_t buffer_capacity() const; | |||
| // device buffer | |||
| VkBufferMemory* data; | |||
| // staging buffer | |||
| VkBufferMemory* staging_data; | |||
| // pointer to the reference counter | |||
| // when points to user-allocated data, the pointer is NULL | |||
| int* refcount; | |||
| int* staging_refcount; | |||
| // element size in bytes | |||
| // 4 = float32/int32 | |||
| @@ -362,7 +349,6 @@ public: | |||
| // the allocator | |||
| VkAllocator* allocator; | |||
| VkAllocator* staging_allocator; | |||
| // the dimension rank | |||
| int dims; | |||
| @@ -1234,90 +1220,87 @@ inline const float& Mat::operator[](size_t i) const | |||
| #if NCNN_VULKAN | |||
| inline VkMat::VkMat() | |||
| : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| { | |||
| } | |||
| inline VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| inline VkMat::VkMat(int _w, size_t _elemsize, VkAllocator* _allocator) | |||
| : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| { | |||
| create(_w, _elemsize, _allocator, _staging_allocator); | |||
| create(_w, _elemsize, _allocator); | |||
| } | |||
| inline VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| inline VkMat::VkMat(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) | |||
| : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| { | |||
| create(_w, _h, _elemsize, _allocator, _staging_allocator); | |||
| create(_w, _h, _elemsize, _allocator); | |||
| } | |||
| inline VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| inline VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) | |||
| : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| { | |||
| create(_w, _h, _c, _elemsize, _allocator, _staging_allocator); | |||
| create(_w, _h, _c, _elemsize, _allocator); | |||
| } | |||
| inline VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| inline VkMat::VkMat(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) | |||
| : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| { | |||
| create(_w, _elemsize, _elempack, _allocator, _staging_allocator); | |||
| create(_w, _elemsize, _elempack, _allocator); | |||
| } | |||
| inline VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| inline VkMat::VkMat(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) | |||
| : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| { | |||
| create(_w, _h, _elemsize, _elempack, _allocator, _staging_allocator); | |||
| create(_w, _h, _elemsize, _elempack, _allocator); | |||
| } | |||
| inline VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| : data(0), staging_data(0), refcount(0), staging_refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| inline VkMat::VkMat(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) | |||
| : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) | |||
| { | |||
| create(_w, _h, _c, _elemsize, _elempack, _allocator, _staging_allocator); | |||
| create(_w, _h, _c, _elemsize, _elempack, _allocator); | |||
| } | |||
| inline VkMat::VkMat(const VkMat& m) | |||
| : data(m.data), staging_data(m.staging_data), refcount(m.refcount), staging_refcount(m.staging_refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), staging_allocator(m.staging_allocator), dims(m.dims), w(m.w), h(m.h), c(m.c) | |||
| : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), c(m.c) | |||
| { | |||
| if (refcount) | |||
| NCNN_XADD(refcount, 1); | |||
| if (staging_refcount) | |||
| NCNN_XADD(staging_refcount, 1); | |||
| cstep = m.cstep; | |||
| } | |||
| inline VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| : data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), staging_allocator(_staging_allocator), dims(1), w(_w), h(1), c(1) | |||
| inline VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) | |||
| : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), c(1) | |||
| { | |||
| cstep = w; | |||
| } | |||
| inline VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| : data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), staging_allocator(_staging_allocator), dims(2), w(_w), h(_h), c(1) | |||
| inline VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) | |||
| : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), c(1) | |||
| { | |||
| cstep = w * h; | |||
| } | |||
| inline VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| : data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), staging_allocator(_staging_allocator), dims(3), w(_w), h(_h), c(_c) | |||
| inline VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) | |||
| : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) | |||
| { | |||
| cstep = alignSize(w * h * elemsize, 16) / elemsize; | |||
| } | |||
| inline VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| : data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), staging_allocator(_staging_allocator), dims(1), w(_w), h(1), c(1) | |||
| inline VkMat::VkMat(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) | |||
| : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), c(1) | |||
| { | |||
| cstep = w; | |||
| } | |||
| inline VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| : data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), staging_allocator(_staging_allocator), dims(2), w(_w), h(_h), c(1) | |||
| inline VkMat::VkMat(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) | |||
| : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), c(1) | |||
| { | |||
| cstep = w * h; | |||
| } | |||
| inline VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| : data(_data), staging_data(0), refcount(0), staging_refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), staging_allocator(_staging_allocator), dims(3), w(_w), h(_h), c(_c) | |||
| inline VkMat::VkMat(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) | |||
| : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) | |||
| { | |||
| cstep = alignSize(w * h * elemsize, 16) / elemsize; | |||
| } | |||
| @@ -1335,19 +1318,13 @@ inline VkMat& VkMat::operator=(const VkMat& m) | |||
| if (m.refcount) | |||
| NCNN_XADD(m.refcount, 1); | |||
| if (m.staging_refcount) | |||
| NCNN_XADD(m.staging_refcount, 1); | |||
| release(); | |||
| data = m.data; | |||
| staging_data = m.staging_data; | |||
| refcount = m.refcount; | |||
| staging_refcount = m.staging_refcount; | |||
| elemsize = m.elemsize; | |||
| elempack = m.elempack; | |||
| allocator = m.allocator; | |||
| staging_allocator = m.staging_allocator; | |||
| dims = m.dims; | |||
| w = m.w; | |||
| @@ -1359,9 +1336,9 @@ inline VkMat& VkMat::operator=(const VkMat& m) | |||
| return *this; | |||
| } | |||
| inline void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| inline void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator) | |||
| { | |||
| if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator && staging_allocator == _staging_allocator) | |||
| if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator) | |||
| return; | |||
| release(); | |||
| @@ -1369,7 +1346,6 @@ inline void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator, VkA | |||
| elemsize = _elemsize; | |||
| elempack = 1; | |||
| allocator = _allocator; | |||
| staging_allocator = _staging_allocator; | |||
| dims = 1; | |||
| w = _w; | |||
| @@ -1389,9 +1365,9 @@ inline void VkMat::create(int _w, size_t _elemsize, VkAllocator* _allocator, VkA | |||
| } | |||
| } | |||
| inline void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| inline void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) | |||
| { | |||
| if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator && staging_allocator == _staging_allocator) | |||
| if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator) | |||
| return; | |||
| release(); | |||
| @@ -1399,7 +1375,6 @@ inline void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _alloca | |||
| elemsize = _elemsize; | |||
| elempack = 1; | |||
| allocator = _allocator; | |||
| staging_allocator = _staging_allocator; | |||
| dims = 2; | |||
| w = _w; | |||
| @@ -1419,9 +1394,9 @@ inline void VkMat::create(int _w, int _h, size_t _elemsize, VkAllocator* _alloca | |||
| } | |||
| } | |||
| inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) | |||
| { | |||
| if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator && staging_allocator == _staging_allocator) | |||
| if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator) | |||
| return; | |||
| release(); | |||
| @@ -1429,7 +1404,6 @@ inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* | |||
| elemsize = _elemsize; | |||
| elempack = 1; | |||
| allocator = _allocator; | |||
| staging_allocator = _staging_allocator; | |||
| dims = 3; | |||
| w = _w; | |||
| @@ -1449,9 +1423,9 @@ inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* | |||
| } | |||
| } | |||
| inline void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| inline void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) | |||
| { | |||
| if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator && staging_allocator == _staging_allocator) | |||
| if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) | |||
| return; | |||
| release(); | |||
| @@ -1459,7 +1433,6 @@ inline void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator* | |||
| elemsize = _elemsize; | |||
| elempack = _elempack; | |||
| allocator = _allocator; | |||
| staging_allocator = _staging_allocator; | |||
| dims = 1; | |||
| w = _w; | |||
| @@ -1479,9 +1452,9 @@ inline void VkMat::create(int _w, size_t _elemsize, int _elempack, VkAllocator* | |||
| } | |||
| } | |||
| inline void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| inline void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) | |||
| { | |||
| if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator && staging_allocator == _staging_allocator) | |||
| if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) | |||
| return; | |||
| release(); | |||
| @@ -1489,7 +1462,6 @@ inline void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAll | |||
| elemsize = _elemsize; | |||
| elempack = _elempack; | |||
| allocator = _allocator; | |||
| staging_allocator = _staging_allocator; | |||
| dims = 2; | |||
| w = _w; | |||
| @@ -1509,9 +1481,9 @@ inline void VkMat::create(int _w, int _h, size_t _elemsize, int _elempack, VkAll | |||
| } | |||
| } | |||
| inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) | |||
| { | |||
| if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator && staging_allocator == _staging_allocator) | |||
| if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) | |||
| return; | |||
| release(); | |||
| @@ -1519,7 +1491,6 @@ inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempac | |||
| elemsize = _elemsize; | |||
| elempack = _elempack; | |||
| allocator = _allocator; | |||
| staging_allocator = _staging_allocator; | |||
| dims = 3; | |||
| w = _w; | |||
| @@ -1539,82 +1510,33 @@ inline void VkMat::create(int _w, int _h, int _c, size_t _elemsize, int _elempac | |||
| } | |||
| } | |||
| inline void VkMat::create_like(const Mat& m, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| inline void VkMat::create_like(const Mat& m, VkAllocator* _allocator) | |||
| { | |||
| int _dims = m.dims; | |||
| if (_dims == 1) | |||
| create(m.w, m.elemsize, m.elempack, _allocator, _staging_allocator); | |||
| create(m.w, m.elemsize, m.elempack, _allocator); | |||
| if (_dims == 2) | |||
| create(m.w, m.h, m.elemsize, m.elempack, _allocator, _staging_allocator); | |||
| create(m.w, m.h, m.elemsize, m.elempack, _allocator); | |||
| if (_dims == 3) | |||
| create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator, _staging_allocator); | |||
| create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); | |||
| } | |||
| inline void VkMat::create_like(const VkMat& m, VkAllocator* _allocator, VkAllocator* _staging_allocator) | |||
| inline void VkMat::create_like(const VkMat& m, VkAllocator* _allocator) | |||
| { | |||
| int _dims = m.dims; | |||
| if (_dims == 1) | |||
| create(m.w, m.elemsize, m.elempack, _allocator, _staging_allocator); | |||
| create(m.w, m.elemsize, m.elempack, _allocator); | |||
| if (_dims == 2) | |||
| create(m.w, m.h, m.elemsize, m.elempack, _allocator, _staging_allocator); | |||
| create(m.w, m.h, m.elemsize, m.elempack, _allocator); | |||
| if (_dims == 3) | |||
| create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator, _staging_allocator); | |||
| } | |||
| inline void VkMat::prepare_staging_buffer() | |||
| { | |||
| if (allocator->mappable) | |||
| return; | |||
| if (staging_allocator && staging_data) | |||
| return; | |||
| size_t totalsize = alignSize(total() * elemsize, 4); | |||
| staging_data = staging_allocator->fastMalloc(totalsize); | |||
| staging_refcount = (int*)((unsigned char*)staging_data + offsetof(VkBufferMemory, refcount)); | |||
| *staging_refcount = 1; | |||
| } | |||
| inline void VkMat::discard_staging_buffer() | |||
| { | |||
| if (allocator->mappable) | |||
| return; | |||
| if (staging_refcount && NCNN_XADD(staging_refcount, -1) == 1) | |||
| { | |||
| if (staging_allocator && staging_data) | |||
| { | |||
| staging_allocator->fastFree(staging_data); | |||
| } | |||
| } | |||
| staging_data = 0; | |||
| staging_refcount = 0; | |||
| } | |||
| inline void VkMat::upload(const Mat& m) | |||
| { | |||
| memcpy(mapped_ptr(), m.data, m.total() * m.elemsize); | |||
| if (allocator->mappable) | |||
| { | |||
| allocator->flush(data); | |||
| } | |||
| } | |||
| inline void VkMat::download(Mat& m) const | |||
| { | |||
| if (allocator->mappable) | |||
| { | |||
| allocator->invalidate(data); | |||
| } | |||
| memcpy(m.data, mapped_ptr(), total() * elemsize); | |||
| create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); | |||
| } | |||
| inline Mat VkMat::mapped() const | |||
| { | |||
| if (!allocator->mappable) | |||
| return Mat(); | |||
| if (dims == 1) | |||
| return Mat(w, mapped_ptr(), elemsize, elempack, 0); | |||
| @@ -1629,17 +1551,16 @@ inline Mat VkMat::mapped() const | |||
| inline void* VkMat::mapped_ptr() const | |||
| { | |||
| VkBufferMemory* mappable_data = allocator->mappable ? data : staging_data; | |||
| return (unsigned char*)mappable_data->mapped_ptr + mappable_data->offset; | |||
| if (!allocator->mappable) | |||
| return 0; | |||
| return (unsigned char*)data->mapped_ptr + data->offset; | |||
| } | |||
| inline void VkMat::addref() | |||
| { | |||
| if (refcount) | |||
| NCNN_XADD(refcount, 1); | |||
| if (staging_refcount) | |||
| NCNN_XADD(staging_refcount, 1); | |||
| } | |||
| inline void VkMat::release() | |||
| @@ -1652,16 +1573,7 @@ inline void VkMat::release() | |||
| } | |||
| } | |||
| if (staging_refcount && NCNN_XADD(staging_refcount, -1) == 1) | |||
| { | |||
| if (staging_allocator && staging_data) | |||
| { | |||
| staging_allocator->fastFree(staging_data); | |||
| } | |||
| } | |||
| data = 0; | |||
| staging_data = 0; | |||
| elemsize = 0; | |||
| elempack = 0; | |||
| @@ -1674,7 +1586,6 @@ inline void VkMat::release() | |||
| cstep = 0; | |||
| refcount = 0; | |||
| staging_refcount = 0; | |||
| } | |||
| inline bool VkMat::empty() const | |||
| @@ -1709,14 +1620,9 @@ inline size_t VkMat::buffer_offset() const | |||
| return data->offset; | |||
| } | |||
| inline VkBuffer VkMat::staging_buffer() const | |||
| { | |||
| return staging_data->buffer; | |||
| } | |||
| inline size_t VkMat::staging_buffer_offset() const | |||
| inline size_t VkMat::buffer_capacity() const | |||
| { | |||
| return staging_data->offset; | |||
| return data->capacity; | |||
| } | |||
| inline VkImageMat::VkImageMat() | |||
| @@ -924,14 +924,16 @@ int Net::upload_model() | |||
| weight_staging_vkallocator = new VkWeightStagingBufferAllocator(vkdev); | |||
| } | |||
| cmd.weight_vkallocator = weight_vkallocator; | |||
| cmd.staging_vkallocator = weight_staging_vkallocator; | |||
| Option opt_upload = opt; | |||
| opt_upload.blob_vkallocator = weight_vkallocator; | |||
| opt_upload.workspace_vkallocator = weight_vkallocator; | |||
| opt_upload.staging_vkallocator = weight_staging_vkallocator; | |||
| for (size_t i=0; i<layers.size(); i++) | |||
| { | |||
| if (layers[i]->support_vulkan) | |||
| { | |||
| int uret = layers[i]->upload_model(cmd, opt); | |||
| int uret = layers[i]->upload_model(cmd, opt_upload); | |||
| if (uret != 0) | |||
| { | |||
| fprintf(stderr, "layer upload_model %d failed\n", (int)i); | |||
| @@ -1347,12 +1349,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| // upload | |||
| VkMat bottom_blob_unpacked; | |||
| bottom_blob_unpacked.create_like(bottom_blob_cpu_fp16, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| bottom_blob_unpacked.prepare_staging_buffer(); | |||
| bottom_blob_unpacked.upload(bottom_blob_cpu_fp16); | |||
| cmd.record_upload(bottom_blob_unpacked); | |||
| cmd.record_upload(bottom_blob_cpu_fp16, bottom_blob_unpacked, opt); | |||
| // cast to fp16 (integrated gpu) | |||
| VkMat bottom_blob_unpacked_fp16; | |||
| @@ -1390,11 +1387,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| if (layer->support_inplace && *bottom_blob.refcount != 1) | |||
| { | |||
| VkMat bottom_blob_copy; | |||
| bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator); | |||
| cmd.record_clone(bottom_blob, bottom_blob_copy, opt); | |||
| // fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); | |||
| cmd.record_clone(bottom_blob, bottom_blob_copy); | |||
| bottom_blob = bottom_blob_copy; | |||
| } | |||
| } | |||
| @@ -1437,7 +1431,6 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| { | |||
| // load bottom blobs | |||
| std::vector<VkMat> bottom_blobs(layer->bottoms.size()); | |||
| std::vector<VkMat> bottom_blobs_unpacked(layer->bottoms.size()); | |||
| for (size_t i=0; i<layer->bottoms.size(); i++) | |||
| { | |||
| int bottom_blob_index = layer->bottoms[i]; | |||
| @@ -1471,13 +1464,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| } | |||
| // upload | |||
| VkMat& bottom_blob_unpacked = bottom_blobs_unpacked[i]; | |||
| bottom_blob_unpacked.create_like(bottom_blob_cpu_fp16, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| bottom_blob_unpacked.prepare_staging_buffer(); | |||
| bottom_blob_unpacked.upload(bottom_blob_cpu_fp16); | |||
| cmd.record_upload(bottom_blob_unpacked); | |||
| VkMat bottom_blob_unpacked; | |||
| cmd.record_upload(bottom_blob_cpu_fp16, bottom_blob_unpacked, opt); | |||
| // cast to fp16 (integrated gpu) | |||
| VkMat bottom_blob_unpacked_fp16; | |||
| @@ -1515,11 +1503,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| if (layer->support_inplace && *bottom_blobs[i].refcount != 1) | |||
| { | |||
| VkMat bottom_blob_copy; | |||
| bottom_blob_copy.create_like(bottom_blobs[i], bottom_blobs[i].allocator, bottom_blobs[i].staging_allocator); | |||
| cmd.record_clone(bottom_blobs[i], bottom_blob_copy, opt); | |||
| // fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blobs[i].buffer(), bottom_blobs[i].buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); | |||
| cmd.record_clone(bottom_blobs[i], bottom_blob_copy); | |||
| bottom_blobs[i] = bottom_blob_copy; | |||
| } | |||
| } | |||
| @@ -1602,11 +1587,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| if (layer->support_inplace && *bottom_blob.refcount != 1) | |||
| { | |||
| VkMat bottom_blob_copy; | |||
| bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator); | |||
| cmd.record_clone(bottom_blob, bottom_blob_copy, opt); | |||
| // fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); | |||
| cmd.record_clone(bottom_blob, bottom_blob_copy); | |||
| bottom_blob = bottom_blob_copy; | |||
| } | |||
| } | |||
| @@ -1614,7 +1596,6 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| VkMat bottom_blob_unpacked_fp16; | |||
| if (opt.use_packing_layout && layer->support_packing) | |||
| { | |||
| // bottom_blob_unpacked_fp16 = bottom_blob; | |||
| packing_pack4->forward(bottom_blob, bottom_blob_unpacked_fp16, cmd, opt); | |||
| } | |||
| else | |||
| @@ -1635,8 +1616,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| } | |||
| // download | |||
| bottom_blob_unpacked.prepare_staging_buffer(); | |||
| cmd.record_download(bottom_blob_unpacked); | |||
| Mat bottom_blob_cpu_fp16; | |||
| cmd.record_download(bottom_blob_unpacked, bottom_blob_cpu_fp16, opt); | |||
| cmd.submit_and_wait(); | |||
| @@ -1657,12 +1638,6 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| cmd.reset(); | |||
| Mat bottom_blob_cpu_fp16; | |||
| bottom_blob_cpu_fp16.create_like(bottom_blob_unpacked, opt.blob_allocator); | |||
| bottom_blob_unpacked.download(bottom_blob_cpu_fp16); | |||
| bottom_blob_unpacked.discard_staging_buffer(); | |||
| // cast to fp32 (discrete gpu) | |||
| Mat& bottom_blob_cpu = blob_mats[bottom_blob_index]; | |||
| if (opt.use_fp16_storage && vkdev->info.type == 0) | |||
| @@ -1742,7 +1717,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| else | |||
| { | |||
| // load bottom blobs | |||
| std::vector<VkMat> bottom_blobs_unpacked(layer->bottoms.size()); | |||
| std::vector<Mat> bottom_blobs_cpu_fp16(layer->bottoms.size()); | |||
| for (size_t i=0; i<layer->bottoms.size(); i++) | |||
| { | |||
| int bottom_blob_index = layer->bottoms[i]; | |||
| @@ -1770,11 +1745,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| if (layer->support_inplace && *bottom_blob.refcount != 1) | |||
| { | |||
| VkMat bottom_blob_copy; | |||
| bottom_blob_copy.create_like(bottom_blob, bottom_blob.allocator, bottom_blob.staging_allocator); | |||
| cmd.record_clone(bottom_blob, bottom_blob_copy, opt); | |||
| // fprintf(stderr, "clone %p[+%lu] %p[+%lu]\n", bottom_blob.buffer(), bottom_blob.buffer_offset(), bottom_blob_copy.buffer(), bottom_blob_copy.buffer_offset()); | |||
| cmd.record_clone(bottom_blob, bottom_blob_copy); | |||
| bottom_blob = bottom_blob_copy; | |||
| } | |||
| } | |||
| @@ -1782,7 +1754,6 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| VkMat bottom_blob_unpacked_fp16; | |||
| if (opt.use_packing_layout && layer->support_packing) | |||
| { | |||
| // bottom_blob_unpacked_fp16 = bottom_blob; | |||
| packing_pack4->forward(bottom_blob, bottom_blob_unpacked_fp16, cmd, opt); | |||
| } | |||
| else | |||
| @@ -1792,7 +1763,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| } | |||
| // cast to fp32 (integrated gpu) | |||
| VkMat& bottom_blob_unpacked = bottom_blobs_unpacked[i]; | |||
| VkMat bottom_blob_unpacked; | |||
| if (opt.use_fp16_storage && vkdev->info.type != 0) | |||
| { | |||
| cast_float16_to_float32->forward(bottom_blob_unpacked_fp16, bottom_blob_unpacked, cmd, opt); | |||
| @@ -1803,8 +1774,8 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| } | |||
| // download | |||
| bottom_blob_unpacked.prepare_staging_buffer(); | |||
| cmd.record_download(bottom_blob_unpacked); | |||
| Mat& bottom_blob_cpu_fp16 = bottom_blobs_cpu_fp16[i]; | |||
| cmd.record_download(bottom_blob_unpacked, bottom_blob_cpu_fp16, opt); | |||
| } | |||
| } | |||
| } | |||
| @@ -1837,13 +1808,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| if (blob_mats[bottom_blob_index].dims == 0) | |||
| { | |||
| VkMat& bottom_blob_unpacked = bottom_blobs_unpacked[i]; | |||
| Mat bottom_blob_cpu_fp16; | |||
| bottom_blob_cpu_fp16.create_like(bottom_blob_unpacked, opt.blob_allocator); | |||
| bottom_blob_unpacked.download(bottom_blob_cpu_fp16); | |||
| bottom_blob_unpacked.discard_staging_buffer(); | |||
| const Mat& bottom_blob_cpu_fp16 = bottom_blobs_cpu_fp16[i]; | |||
| // cast to fp32 (discrete gpu) | |||
| Mat& bottom_blob_cpu = blob_mats[bottom_blob_index]; | |||
| @@ -1884,7 +1849,7 @@ int Net::forward_layer(int layer_index, std::vector<Mat>& blob_mats, std::vector | |||
| } | |||
| } | |||
| bottom_blobs_unpacked.clear(); | |||
| bottom_blobs_cpu_fp16.clear(); | |||
| // forward | |||
| if (opt.lightmode && layer->support_inplace) | |||
| @@ -2113,15 +2078,15 @@ int Extractor::extract(int blob_index, Mat& feat) | |||
| } | |||
| // download | |||
| feat_gpu_unpacked.prepare_staging_buffer(); | |||
| cmd.record_download(feat_gpu_unpacked); | |||
| Mat feat_cpu_fp16; | |||
| cmd.record_download(feat_gpu_unpacked, feat_cpu_fp16, opt); | |||
| cmd.submit_and_wait(); | |||
| #if NCNN_BENCHMARK | |||
| std::vector<uint64_t> results(net->layers.size() * 2); | |||
| cmd.get_query_pool_results(0, net->layers.size() * 2, results); | |||
| for (int i=0; i<net->layers.size(); i++) | |||
| for (size_t i=0; i<net->layers.size(); i++) | |||
| { | |||
| uint64_t start = results[i*2]; | |||
| uint64_t end = results[i*2+1]; | |||
| @@ -2133,12 +2098,6 @@ int Extractor::extract(int blob_index, Mat& feat) | |||
| } | |||
| #endif // NCNN_BENCHMARK | |||
| Mat feat_cpu_fp16; | |||
| feat_cpu_fp16.create_like(feat_gpu_unpacked, opt.blob_allocator); | |||
| feat_gpu_unpacked.download(feat_cpu_fp16); | |||
| feat_gpu_unpacked.discard_staging_buffer(); | |||
| // cast to fp32 (discrete gpu) | |||
| Mat& feat_cpu = blob_mats[blob_index]; | |||
| if (opt.use_fp16_storage && net->vkdev->info.type == 0) | |||
| @@ -267,7 +267,7 @@ int Pipeline::create_pipeline_layout(int push_constant_count) | |||
| VkPushConstantRange pushConstantRange; | |||
| pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; | |||
| pushConstantRange.offset = 0; | |||
| pushConstantRange.size = sizeof(int) * push_constant_count; | |||
| pushConstantRange.size = sizeof(vk_constant_type) * push_constant_count; | |||
| VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo; | |||
| pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; | |||
| @@ -207,39 +207,30 @@ static int test_cast_gpu_fp16p(const ncnn::Mat& a, int type_from, int type_to) | |||
| a4_fp16 = a4; | |||
| } | |||
| // upload | |||
| ncnn::VkMat a4_gpu; | |||
| a4_gpu.create_like(a4_fp16, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| a4_gpu.prepare_staging_buffer(); | |||
| a4_gpu.upload(a4_fp16); | |||
| // forward | |||
| ncnn::VkCompute cmd(vkdev); | |||
| cmd.record_upload(a4_gpu); | |||
| // upload | |||
| ncnn::VkMat a4_gpu; | |||
| cmd.record_upload(a4_fp16, a4_gpu, opt); | |||
| ncnn::VkMat d4_gpu; | |||
| if (op->support_inplace) | |||
| { | |||
| d4_gpu.create_like(a4_gpu, a4_gpu.allocator, a4_gpu.staging_allocator); | |||
| cmd.record_clone(a4_gpu, d4_gpu); | |||
| op->forward_inplace(d4_gpu, cmd, opt); | |||
| op->forward_inplace(a4_gpu, cmd, opt); | |||
| d4_gpu = a4_gpu; | |||
| } | |||
| else | |||
| { | |||
| op->forward(a4_gpu, d4_gpu, cmd, opt); | |||
| } | |||
| d4_gpu.prepare_staging_buffer(); | |||
| cmd.record_download(d4_gpu); | |||
| // download | |||
| cmd.record_download(d4_gpu, d, opt); | |||
| cmd.submit_and_wait(); | |||
| // download | |||
| d.create_like(d4_gpu); | |||
| d4_gpu.download(d); | |||
| op->destroy_pipeline(opt); | |||
| delete op; | |||
| @@ -331,39 +322,30 @@ static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type | |||
| a4_fp16 = a4; | |||
| } | |||
| // upload | |||
| ncnn::VkMat a4_gpu; | |||
| a4_gpu.create_like(a4_fp16, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| a4_gpu.prepare_staging_buffer(); | |||
| a4_gpu.upload(a4_fp16); | |||
| // forward | |||
| ncnn::VkCompute cmd(vkdev); | |||
| cmd.record_upload(a4_gpu); | |||
| // upload | |||
| ncnn::VkMat a4_gpu; | |||
| cmd.record_upload(a4_fp16, a4_gpu, opt); | |||
| ncnn::VkMat d4_gpu; | |||
| if (op->support_inplace) | |||
| { | |||
| d4_gpu.create_like(a4_gpu, a4_gpu.allocator, a4_gpu.staging_allocator); | |||
| cmd.record_clone(a4_gpu, d4_gpu); | |||
| op->forward_inplace(d4_gpu, cmd, opt); | |||
| op->forward_inplace(a4_gpu, cmd, opt); | |||
| d4_gpu = a4_gpu; | |||
| } | |||
| else | |||
| { | |||
| op->forward(a4_gpu, d4_gpu, cmd, opt); | |||
| } | |||
| d4_gpu.prepare_staging_buffer(); | |||
| cmd.record_download(d4_gpu); | |||
| // download | |||
| cmd.record_download(d4_gpu, d, opt); | |||
| cmd.submit_and_wait(); | |||
| // download | |||
| d.create_like(d4_gpu); | |||
| d4_gpu.download(d); | |||
| op->destroy_pipeline(opt); | |||
| delete op; | |||
| @@ -261,10 +261,13 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn: | |||
| if (opt.use_vulkan_compute) | |||
| { | |||
| ncnn::VkTransfer cmd(vkdev); | |||
| cmd.weight_vkallocator = &g_weight_vkallocator; | |||
| cmd.staging_vkallocator = &g_weight_staging_vkallocator; | |||
| op->upload_model(cmd, opt); | |||
| ncnn::Option opt_upload = opt; | |||
| opt_upload.blob_vkallocator = &g_weight_vkallocator; | |||
| opt_upload.workspace_vkallocator = &g_weight_vkallocator; | |||
| opt_upload.staging_vkallocator = &g_weight_staging_vkallocator; | |||
| op->upload_model(cmd, opt_upload); | |||
| cmd.submit_and_wait(); | |||
| } | |||
| @@ -367,57 +370,35 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn: | |||
| } | |||
| } | |||
| // upload | |||
| std::vector<ncnn::VkMat> a4_fp16_gpu(a4_fp16.size()); | |||
| for (size_t i=0; i<a4_fp16.size(); i++) | |||
| { | |||
| a4_fp16_gpu[i].create_like(a4_fp16[i], opt.blob_vkallocator, opt.staging_vkallocator); | |||
| a4_fp16_gpu[i].prepare_staging_buffer(); | |||
| a4_fp16_gpu[i].upload(a4_fp16[i]); | |||
| } | |||
| // forward | |||
| ncnn::VkCompute cmd(vkdev); | |||
| // upload | |||
| std::vector<ncnn::VkMat> a4_fp16_gpu(a4_fp16.size()); | |||
| for (size_t i=0; i<a4_fp16_gpu.size(); i++) | |||
| { | |||
| cmd.record_upload(a4_fp16_gpu[i]); | |||
| cmd.record_upload(a4_fp16[i], a4_fp16_gpu[i], opt); | |||
| } | |||
| std::vector<ncnn::VkMat> d4_fp16_gpu(top_blob_count); | |||
| if (op->support_inplace) | |||
| { | |||
| for (size_t i=0; i<a4_fp16_gpu.size(); i++) | |||
| { | |||
| d4_fp16_gpu[i].create_like(a4_fp16_gpu[i], a4_fp16_gpu[i].allocator, a4_fp16_gpu[i].staging_allocator); | |||
| cmd.record_clone(a4_fp16_gpu[i], d4_fp16_gpu[i]); | |||
| } | |||
| op->forward_inplace(a4_fp16_gpu, cmd, opt); | |||
| op->forward_inplace(d4_fp16_gpu, cmd, opt); | |||
| d4_fp16_gpu = a4_fp16_gpu; | |||
| } | |||
| else | |||
| { | |||
| op->forward(a4_fp16_gpu, d4_fp16_gpu, cmd, opt); | |||
| } | |||
| // download | |||
| for (size_t i=0; i<d4_fp16_gpu.size(); i++) | |||
| { | |||
| d4_fp16_gpu[i].prepare_staging_buffer(); | |||
| } | |||
| for (size_t i=0; i<d4_fp16_gpu.size(); i++) | |||
| { | |||
| cmd.record_download(d4_fp16_gpu[i]); | |||
| cmd.record_download(d4_fp16_gpu[i], d[i], opt); | |||
| } | |||
| cmd.submit_and_wait(); | |||
| // download | |||
| for (size_t i=0; i<d4_fp16_gpu.size(); i++) | |||
| { | |||
| d[i].create_like(d4_fp16_gpu[i]); | |||
| d4_fp16_gpu[i].download(d[i]); | |||
| } | |||
| } | |||
| #endif // NCNN_VULKAN | |||
| @@ -509,14 +490,15 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn: | |||
| if (opt.use_vulkan_compute) | |||
| { | |||
| ncnn::VkTransfer cmd(vkdev); | |||
| cmd.weight_vkallocator = &g_weight_vkallocator; | |||
| cmd.staging_vkallocator = &g_weight_staging_vkallocator; | |||
| op->upload_model(cmd, opt); | |||
| ncnn::Option opt_upload = opt; | |||
| opt_upload.blob_vkallocator = &g_weight_vkallocator; | |||
| opt_upload.workspace_vkallocator = &g_weight_vkallocator; | |||
| opt_upload.staging_vkallocator = &g_weight_staging_vkallocator; | |||
| cmd.submit_and_wait(); | |||
| op->upload_model(cmd, opt_upload); | |||
| g_weight_staging_vkallocator.clear(); | |||
| cmd.submit_and_wait(); | |||
| } | |||
| #endif // NCNN_VULKAN | |||
| @@ -594,38 +576,29 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn: | |||
| a4_fp16 = a4; | |||
| } | |||
| // upload | |||
| ncnn::VkMat a4_fp16_gpu; | |||
| a4_fp16_gpu.create_like(a4_fp16, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| a4_fp16_gpu.prepare_staging_buffer(); | |||
| a4_fp16_gpu.upload(a4_fp16); | |||
| // forward | |||
| ncnn::VkCompute cmd(vkdev); | |||
| cmd.record_upload(a4_fp16_gpu); | |||
| // upload | |||
| ncnn::VkMat a4_fp16_gpu; | |||
| cmd.record_upload(a4_fp16, a4_fp16_gpu, opt); | |||
| ncnn::VkMat d4_fp16_gpu; | |||
| if (op->support_inplace) | |||
| { | |||
| d4_fp16_gpu.create_like(a4_fp16_gpu, a4_fp16_gpu.allocator, a4_fp16_gpu.staging_allocator); | |||
| cmd.record_clone(a4_fp16_gpu, d4_fp16_gpu); | |||
| op->forward_inplace(d4_fp16_gpu, cmd, opt); | |||
| op->forward_inplace(a4_fp16_gpu, cmd, opt); | |||
| d4_fp16_gpu = a4_fp16_gpu; | |||
| } | |||
| else | |||
| { | |||
| op->forward(a4_fp16_gpu, d4_fp16_gpu, cmd, opt); | |||
| } | |||
| d4_fp16_gpu.prepare_staging_buffer(); | |||
| cmd.record_download(d4_fp16_gpu); | |||
| // download | |||
| cmd.record_download(d4_fp16_gpu, d, opt); | |||
| cmd.submit_and_wait(); | |||
| // download | |||
| d.create_like(d4_fp16_gpu); | |||
| d4_fp16_gpu.download(d); | |||
| } | |||
| #endif // NCNN_VULKAN | |||