// Tencent is pleased to support the open source community by making ncnn available. // // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // // Unless required by applicable law or agreed to in writing, software distributed // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. #include "allocator.h" #include #include #include "gpu.h" #include "pipeline.h" #if __ANDROID_API__ >= 26 #include #endif // __ANDROID_API__ >= 26 namespace ncnn { Allocator::~Allocator() { } PoolAllocator::PoolAllocator() { size_compare_ratio = 192;// 0.75f * 256 } PoolAllocator::~PoolAllocator() { clear(); if (!payouts.empty()) { fprintf(stderr, "FATAL ERROR! pool allocator destroyed too early\n"); std::list< std::pair >::iterator it = payouts.begin(); for (; it != payouts.end(); it++) { void* ptr = it->second; fprintf(stderr, "%p still in use\n", ptr); } } } void PoolAllocator::clear() { budgets_lock.lock(); std::list< std::pair >::iterator it = budgets.begin(); for (; it != budgets.end(); it++) { void* ptr = it->second; ncnn::fastFree(ptr); } budgets.clear(); budgets_lock.unlock(); } void PoolAllocator::set_size_compare_ratio(float scr) { if (scr < 0.f || scr > 1.f) { fprintf(stderr, "invalid size compare ratio %f\n", scr); return; } size_compare_ratio = (unsigned int)(scr * 256); } void* PoolAllocator::fastMalloc(size_t size) { budgets_lock.lock(); // find free budget std::list< std::pair >::iterator it = budgets.begin(); for (; it != budgets.end(); it++) { size_t bs = it->first; // size_compare_ratio ~ 100% if (bs >= size && ((bs * size_compare_ratio) >> 8) <= size) { void* ptr = it->second; budgets.erase(it); budgets_lock.unlock(); payouts_lock.lock(); payouts.push_back(std::make_pair(bs, ptr)); payouts_lock.unlock(); return ptr; } } budgets_lock.unlock(); // new void* ptr = ncnn::fastMalloc(size); payouts_lock.lock(); payouts.push_back(std::make_pair(size, ptr)); payouts_lock.unlock(); return ptr; } void PoolAllocator::fastFree(void* ptr) { payouts_lock.lock(); // return to budgets std::list< std::pair >::iterator it = payouts.begin(); for (; it != payouts.end(); it++) { if (it->second == ptr) { size_t size = it->first; payouts.erase(it); payouts_lock.unlock(); budgets_lock.lock(); budgets.push_back(std::make_pair(size, ptr)); budgets_lock.unlock(); return; } } payouts_lock.unlock(); fprintf(stderr, "FATAL ERROR! pool allocator get wild %p\n", ptr); ncnn::fastFree(ptr); } UnlockedPoolAllocator::UnlockedPoolAllocator() { size_compare_ratio = 192;// 0.75f * 256 } UnlockedPoolAllocator::~UnlockedPoolAllocator() { clear(); if (!payouts.empty()) { fprintf(stderr, "FATAL ERROR! unlocked pool allocator destroyed too early\n"); std::list< std::pair >::iterator it = payouts.begin(); for (; it != payouts.end(); it++) { void* ptr = it->second; fprintf(stderr, "%p still in use\n", ptr); } } } void UnlockedPoolAllocator::clear() { std::list< std::pair >::iterator it = budgets.begin(); for (; it != budgets.end(); it++) { void* ptr = it->second; ncnn::fastFree(ptr); } budgets.clear(); } void UnlockedPoolAllocator::set_size_compare_ratio(float scr) { if (scr < 0.f || scr > 1.f) { fprintf(stderr, "invalid size compare ratio %f\n", scr); return; } size_compare_ratio = (unsigned int)(scr * 256); } void* UnlockedPoolAllocator::fastMalloc(size_t size) { // find free budget std::list< std::pair >::iterator it = budgets.begin(); for (; it != budgets.end(); it++) { size_t bs = it->first; // size_compare_ratio ~ 100% if (bs >= size && ((bs * size_compare_ratio) >> 8) <= size) { void* ptr = it->second; budgets.erase(it); payouts.push_back(std::make_pair(bs, ptr)); return ptr; } } // new void* ptr = ncnn::fastMalloc(size); payouts.push_back(std::make_pair(size, ptr)); return ptr; } void UnlockedPoolAllocator::fastFree(void* ptr) { // return to budgets std::list< std::pair >::iterator it = payouts.begin(); for (; it != payouts.end(); it++) { if (it->second == ptr) { size_t size = it->first; payouts.erase(it); budgets.push_back(std::make_pair(size, ptr)); return; } } fprintf(stderr, "FATAL ERROR! unlocked pool allocator get wild %p\n", ptr); ncnn::fastFree(ptr); } #if NCNN_VULKAN VkAllocator::VkAllocator(const VulkanDevice* _vkdev) : vkdev(_vkdev) { memory_type_index = (uint32_t)-1; mappable = false; coherent = false; } static inline size_t round_up(size_t n, size_t multiple) { return (n + n - 1) / multiple * multiple; } static inline size_t round_down(size_t n, size_t multiple) { return n / multiple * multiple; } int VkAllocator::flush(VkBufferMemory* ptr) { if (coherent) return 0; VkMappedMemoryRange mappedMemoryRange; mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; mappedMemoryRange.pNext = 0; mappedMemoryRange.memory = ptr->memory; mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size); mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size) - mappedMemoryRange.offset; VkResult ret = vkFlushMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange); if (ret != VK_SUCCESS) { fprintf(stderr, "vkFlushMappedMemoryRanges failed %d\n", ret); return -1; } return 0; } int VkAllocator::invalidate(VkBufferMemory* ptr) { if (coherent) return 0; VkMappedMemoryRange mappedMemoryRange; mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; mappedMemoryRange.pNext = 0; mappedMemoryRange.memory = ptr->memory; mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size); mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size) - mappedMemoryRange.offset; VkResult ret = vkInvalidateMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange); if (ret != VK_SUCCESS) { fprintf(stderr, "vkInvalidateMappedMemoryRanges failed %d\n", ret); return -1; } return 0; } VkBuffer VkAllocator::create_buffer(size_t size, VkBufferUsageFlags usage) { VkBufferCreateInfo bufferCreateInfo; bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; bufferCreateInfo.pNext = 0; bufferCreateInfo.flags = 0; bufferCreateInfo.size = size; bufferCreateInfo.usage = usage; bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; bufferCreateInfo.queueFamilyIndexCount = 0; bufferCreateInfo.pQueueFamilyIndices = 0; VkBuffer buffer = 0; VkResult ret = vkCreateBuffer(vkdev->vkdevice(), &bufferCreateInfo, 0, &buffer); if (ret != VK_SUCCESS) { fprintf(stderr, "vkCreateBuffer failed %d\n", ret); return 0; } return buffer; } VkDeviceMemory VkAllocator::allocate_memory(size_t size) { VkMemoryAllocateInfo memoryAllocateInfo; memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; memoryAllocateInfo.pNext = 0; memoryAllocateInfo.allocationSize = size; memoryAllocateInfo.memoryTypeIndex = memory_type_index; VkDeviceMemory memory = 0; VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory); if (ret != VK_SUCCESS) { fprintf(stderr, "vkAllocateMemory failed %d\n", ret); return 0; } return memory; } VkDeviceMemory VkAllocator::allocate_dedicated_memory(size_t size, VkBuffer buffer) { VkMemoryAllocateInfo memoryAllocateInfo; memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; memoryAllocateInfo.pNext = 0; memoryAllocateInfo.allocationSize = size; memoryAllocateInfo.memoryTypeIndex = memory_type_index; VkMemoryDedicatedAllocateInfoKHR memoryDedicatedAllocateInfo; memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR; memoryDedicatedAllocateInfo.pNext = 0; memoryDedicatedAllocateInfo.image = 0; memoryDedicatedAllocateInfo.buffer = buffer; memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo; VkDeviceMemory memory = 0; VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory); if (ret != VK_SUCCESS) { fprintf(stderr, "vkAllocateMemory failed %d\n", ret); return 0; } return memory; } static inline size_t least_common_multiple(size_t a, size_t b) { if (a == b) return a; if (a > b) return least_common_multiple(b, a); size_t lcm = b; while (lcm % a != 0) { lcm += b; } return lcm; } VkBlobBufferAllocator::VkBlobBufferAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) { buffer_offset_alignment = vkdev->info.buffer_offset_alignment; if (vkdev->info.type == 1) { // on integrated gpu, there may be device local only memory too, eg. AMD APU // assuming larger alignment always keeps us safe :) // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.memory_map_alignment); buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.non_coherent_atom_size); } block_size = alignSize(16 * 1024 * 1024, buffer_offset_alignment);// 16M } VkBlobBufferAllocator::~VkBlobBufferAllocator() { clear(); } void VkBlobBufferAllocator::clear() { // fprintf(stderr, "VkBlobBufferAllocator %lu\n", buffer_blocks.size()); for (size_t i=0; i >::iterator it = budgets[i].begin(); // while (it != budgets[i].end()) // { // fprintf(stderr, "VkBlobBufferAllocator budget %p %lu %lu\n", ptr->buffer, it->first, it->second); // it++; // } if (mappable) vkUnmapMemory(vkdev->vkdevice(), ptr->memory); vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); delete ptr; } buffer_blocks.clear(); budgets.clear(); } VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size) { size_t aligned_size = alignSize(size, buffer_offset_alignment); const int buffer_block_count = buffer_blocks.size(); // find first spare space in buffer_blocks for (int i=0; i >::iterator it = budgets[i].begin(); while (it != budgets[i].end()) { size_t budget_size = it->second; if (budget_size < aligned_size) { it++; continue; } // return sub buffer VkBufferMemory* ptr = new VkBufferMemory; ptr->buffer = buffer_blocks[i]->buffer; ptr->offset = it->first; ptr->memory = buffer_blocks[i]->memory; ptr->capacity = aligned_size; ptr->mapped_ptr = buffer_blocks[i]->mapped_ptr; ptr->state = 1; // adjust budgets if (budget_size == aligned_size) { budgets[i].erase(it); } else { it->first += aligned_size; it->second -= aligned_size; } // fprintf(stderr, "VkBlobBufferAllocator M %p +%lu %lu\n", ptr->buffer, ptr->offset, ptr->capacity); return ptr; } } size_t new_block_size = std::max(block_size, aligned_size); // create new block VkBufferMemory* block = new VkBufferMemory; block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); block->offset = 0; // TODO respect VK_KHR_dedicated_allocation ? VkMemoryRequirements memoryRequirements; vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements); // setup memory type and alignment if (memory_type_index == (uint32_t)-1) { if (vkdev->info.type == 1) { // integrated gpu, prefer unified memory memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); } else { // discrete gpu, device local memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); } mappable = vkdev->is_mappable(memory_type_index); coherent = vkdev->is_coherent(memory_type_index); } block->memory = allocate_memory(memoryRequirements.size); vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); block->mapped_ptr = 0; if (mappable) { vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); } buffer_blocks.push_back(block); // return sub buffer VkBufferMemory* ptr = new VkBufferMemory; ptr->buffer = block->buffer; ptr->offset = 0; ptr->memory = block->memory; ptr->capacity = aligned_size; ptr->mapped_ptr = block->mapped_ptr; ptr->state = 1; // adjust budgets std::list< std::pair > budget; if (new_block_size > aligned_size) { budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size)); } budgets.push_back(budget); // fprintf(stderr, "VkBlobBufferAllocator M %p +%lu %lu\n", ptr->buffer, ptr->offset, ptr->capacity); return ptr; } void VkBlobBufferAllocator::fastFree(VkBufferMemory* ptr) { // fprintf(stderr, "VkBlobBufferAllocator F %p +%lu %lu\n", ptr->buffer, ptr->offset, ptr->capacity); const int buffer_block_count = buffer_blocks.size(); int block_index = -1; for (int i=0; ibuffer == ptr->buffer && buffer_blocks[i]->memory == ptr->memory) { block_index = i; break; } } if (block_index == -1) { fprintf(stderr, "FATAL ERROR! unlocked VkBlobBufferAllocator get wild %p\n", ptr->buffer); delete ptr; return; } // merge std::list< std::pair >::iterator it_merge_left = budgets[block_index].end(); std::list< std::pair >::iterator it_merge_right = budgets[block_index].end(); std::list< std::pair >::iterator it = budgets[block_index].begin(); for ( ; it != budgets[block_index].end(); it++) { if (it->first + it->second == ptr->offset) { it_merge_left = it; } else if (ptr->offset + ptr->capacity == it->first) { it_merge_right = it; } } if (it_merge_left != budgets[block_index].end() && it_merge_right != budgets[block_index].end()) { it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first; budgets[block_index].erase(it_merge_right); } else if (it_merge_left != budgets[block_index].end()) { it_merge_left->second = ptr->offset + ptr->capacity - it_merge_left->first; } else if (it_merge_right != budgets[block_index].end()) { it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->offset; it_merge_right->first = ptr->offset; } else { if (ptr->offset == 0) { // chain leading block budgets[block_index].push_front(std::make_pair(ptr->offset, ptr->capacity)); } else { budgets[block_index].push_back(std::make_pair(ptr->offset, ptr->capacity)); } } delete ptr; } VkWeightBufferAllocator::VkWeightBufferAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) { buffer_offset_alignment = vkdev->info.buffer_offset_alignment; if (vkdev->info.type == 1) { // on integrated gpu, there may be device local only memory too, eg. AMD APU // assuming larger alignment always keeps us safe :) // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.memory_map_alignment); buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.non_coherent_atom_size); } block_size = alignSize(8 * 1024 * 1024, buffer_offset_alignment);// 8M } VkWeightBufferAllocator::~VkWeightBufferAllocator() { clear(); } void VkWeightBufferAllocator::clear() { // fprintf(stderr, "VkWeightBufferAllocator %lu %lu\n", buffer_blocks.size(), dedicated_buffer_blocks.size()); buffer_block_free_spaces.clear(); for (size_t i=0; ivkdevice(), ptr->memory); vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); delete ptr; } buffer_blocks.clear(); for (size_t i=0; ivkdevice(), ptr->memory); vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); delete ptr; } dedicated_buffer_blocks.clear(); } VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size) { // fprintf(stderr, "VkWeightBufferAllocator fastMalloc %lu\n", size); size_t aligned_size = alignSize(size, buffer_offset_alignment); const int buffer_block_count = buffer_blocks.size(); // find first spare space in buffer_blocks int block_index = -1; size_t block_offset = 0; for (int i=0; i= aligned_size) { block_index = i; block_offset = block_size - free_size; break; } } if (block_index != -1) { // return sub buffer VkBufferMemory* ptr = new VkBufferMemory; ptr->buffer = buffer_blocks[block_index]->buffer; ptr->offset = block_offset; ptr->memory = buffer_blocks[block_index]->memory; ptr->capacity = aligned_size; ptr->mapped_ptr = buffer_blocks[block_index]->mapped_ptr; ptr->state = 1; buffer_block_free_spaces[block_index] -= aligned_size; return ptr; } size_t new_block_size = std::max(block_size, aligned_size); // create new block VkBufferMemory* block = new VkBufferMemory; block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); block->offset = 0; if (vkdev->info.support_VK_KHR_get_memory_requirements2 && vkdev->info.support_VK_KHR_dedicated_allocation) { VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2; bufferMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR; bufferMemoryRequirementsInfo2.pNext = 0; bufferMemoryRequirementsInfo2.buffer = block->buffer; VkMemoryRequirements2KHR memoryRequirements2; memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR; memoryRequirements2.pNext = 0; VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements; memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR; memoryDedicatedRequirements.pNext = 0; memoryRequirements2.pNext = &memoryDedicatedRequirements; vkdev->vkGetBufferMemoryRequirements2KHR(vkdev->vkdevice(), &bufferMemoryRequirementsInfo2, &memoryRequirements2); bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation; if (dedicatedAllocation) { // setup memory type and alignment if (memory_type_index == (uint32_t)-1) { if (vkdev->info.type == 1) { // integrated gpu, prefer unified memory memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); } else { // discrete gpu, device local memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); } mappable = vkdev->is_mappable(memory_type_index); coherent = vkdev->is_coherent(memory_type_index); } block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, block->buffer); vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); block->mapped_ptr = 0; if (mappable) { vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); } dedicated_buffer_blocks.push_back(block); // return sub buffer VkBufferMemory* ptr = new VkBufferMemory; ptr->buffer = block->buffer; ptr->offset = 0; ptr->memory = block->memory; ptr->capacity = new_block_size; ptr->mapped_ptr = block->mapped_ptr; ptr->state = 1; return ptr; } } VkMemoryRequirements memoryRequirements; vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements); // setup memory type and alignment if (memory_type_index == (uint32_t)-1) { if (vkdev->info.type == 1) { // integrated gpu, prefer unified memory memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); } else { // discrete gpu, device local memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); } mappable = vkdev->is_mappable(memory_type_index); coherent = vkdev->is_coherent(memory_type_index); } block->memory = allocate_memory(memoryRequirements.size); vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); // fprintf(stderr, "VkWeightBufferAllocator M %p\n", block->buffer); block->mapped_ptr = 0; if (mappable) { vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); } buffer_blocks.push_back(block); buffer_block_free_spaces.push_back(new_block_size - aligned_size); // return sub buffer VkBufferMemory* ptr = new VkBufferMemory; ptr->buffer = block->buffer; ptr->offset = 0; ptr->memory = block->memory; ptr->capacity = aligned_size; ptr->mapped_ptr = block->mapped_ptr; ptr->state = 1; return ptr; } void VkWeightBufferAllocator::fastFree(VkBufferMemory* ptr) { // fprintf(stderr, "VkWeightBufferAllocator F %p\n", ptr->buffer); delete ptr; } VkStagingBufferAllocator::VkStagingBufferAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) { mappable = true; coherent = true; size_compare_ratio = 192;// 0.75f * 256 } VkStagingBufferAllocator::~VkStagingBufferAllocator() { clear(); } void VkStagingBufferAllocator::set_size_compare_ratio(float scr) { if (scr < 0.f || scr > 1.f) { fprintf(stderr, "invalid size compare ratio %f\n", scr); return; } size_compare_ratio = (unsigned int)(scr * 256); } void VkStagingBufferAllocator::clear() { // fprintf(stderr, "VkStagingBufferAllocator %lu\n", budgets.size()); std::list::iterator it = budgets.begin(); for (; it != budgets.end(); it++) { VkBufferMemory* ptr = *it; // fprintf(stderr, "VkStagingBufferAllocator F %p\n", ptr->buffer); vkUnmapMemory(vkdev->vkdevice(), ptr->memory); vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); delete ptr; } budgets.clear(); } VkBufferMemory* VkStagingBufferAllocator::fastMalloc(size_t size) { // find free budget std::list::iterator it = budgets.begin(); for (; it != budgets.end(); it++) { VkBufferMemory* ptr = *it; size_t capacity = ptr->capacity; // size_compare_ratio ~ 100% if (capacity >= size && ((capacity * size_compare_ratio) >> 8) <= size) { budgets.erase(it); // fprintf(stderr, "VkStagingBufferAllocator M %p %lu reused %lu\n", ptr->buffer, size, capacity); return ptr; } } VkBufferMemory* ptr = new VkBufferMemory; ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); ptr->offset = 0; VkMemoryRequirements memoryRequirements; vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements); // setup memory type if (memory_type_index == (uint32_t)-1) { memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); } ptr->memory = allocate_memory(memoryRequirements.size); vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0); ptr->capacity = size; vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr); ptr->state = 1; // fprintf(stderr, "VkStagingBufferAllocator M %p %lu\n", ptr->buffer, size); return ptr; } void VkStagingBufferAllocator::fastFree(VkBufferMemory* ptr) { // fprintf(stderr, "VkStagingBufferAllocator F %p\n", ptr->buffer); // return to budgets budgets.push_back(ptr); } VkWeightStagingBufferAllocator::VkWeightStagingBufferAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) { mappable = true; coherent = true; } VkWeightStagingBufferAllocator::~VkWeightStagingBufferAllocator() { } VkBufferMemory* VkWeightStagingBufferAllocator::fastMalloc(size_t size) { VkBufferMemory* ptr = new VkBufferMemory; ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); ptr->offset = 0; VkMemoryRequirements memoryRequirements; vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements); // setup memory type if (memory_type_index == (uint32_t)-1) { memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); } ptr->memory = allocate_memory(memoryRequirements.size); vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0); ptr->capacity = size; vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr); ptr->state = 1; // fprintf(stderr, "VkWeightStagingBufferAllocator M %p %lu\n", ptr->buffer, size); return ptr; } void VkWeightStagingBufferAllocator::fastFree(VkBufferMemory* ptr) { // fprintf(stderr, "VkWeightStagingBufferAllocator F %p\n", ptr->buffer); vkUnmapMemory(vkdev->vkdevice(), ptr->memory); vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); delete ptr; } VkImageAllocator::VkImageAllocator(const VulkanDevice* _vkdev) : VkAllocator(_vkdev) { memory_type_index = (uint32_t)-1; } VkImage VkImageAllocator::create_image(int width, int height, VkFormat format, VkImageUsageFlags usage) { VkImageCreateInfo imageCreateInfo; imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, imageCreateInfo.pNext = 0; imageCreateInfo.flags = 0; imageCreateInfo.imageType = VK_IMAGE_TYPE_2D; imageCreateInfo.format = format; imageCreateInfo.extent.width = width; imageCreateInfo.extent.height = height; imageCreateInfo.extent.depth = 1; imageCreateInfo.mipLevels = 1; imageCreateInfo.arrayLayers = 1; imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; imageCreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL; imageCreateInfo.usage = usage; imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; imageCreateInfo.queueFamilyIndexCount = 0; imageCreateInfo.pQueueFamilyIndices = 0; imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; VkImage image; VkResult ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image); if (ret != VK_SUCCESS) { fprintf(stderr, "vkCreateImage failed %d\n", ret); return 0; } return image; } VkImageView VkImageAllocator::create_imageview(VkImage image, VkFormat format) { VkImageViewCreateInfo imageViewCreateInfo; imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; imageViewCreateInfo.pNext = 0; imageViewCreateInfo.flags = 0; imageViewCreateInfo.image = image; imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; imageViewCreateInfo.format = format; imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; imageViewCreateInfo.subresourceRange.baseMipLevel = 0; imageViewCreateInfo.subresourceRange.levelCount = 1; imageViewCreateInfo.subresourceRange.baseArrayLayer = 0; imageViewCreateInfo.subresourceRange.layerCount = 1; VkImageView imageview; VkResult ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview); if (ret != VK_SUCCESS) { fprintf(stderr, "vkCreateImageView failed %d\n", ret); return 0; } return imageview; } VkDeviceMemory VkImageAllocator::allocate_dedicated_memory(size_t size, VkImage image) { VkMemoryAllocateInfo memoryAllocateInfo; memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; memoryAllocateInfo.pNext = 0; memoryAllocateInfo.allocationSize = size; memoryAllocateInfo.memoryTypeIndex = memory_type_index; VkMemoryDedicatedAllocateInfoKHR memoryDedicatedAllocateInfo; memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR; memoryDedicatedAllocateInfo.pNext = 0; memoryDedicatedAllocateInfo.image = image; memoryDedicatedAllocateInfo.buffer = 0; memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo; VkDeviceMemory memory = 0; VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory); if (ret != VK_SUCCESS) { fprintf(stderr, "vkAllocateMemory failed %d\n", ret); } return memory; } VkSimpleImageAllocator::VkSimpleImageAllocator(const VulkanDevice* _vkdev) : VkImageAllocator(_vkdev) { } VkSimpleImageAllocator::~VkSimpleImageAllocator() { } VkImageMemory* VkSimpleImageAllocator::fastMalloc(int width, int height, VkFormat format) { VkImageMemory* ptr = new VkImageMemory; ptr->image = create_image(width, height, format, VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); VkMemoryRequirements memoryRequirements; vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements); // setup memory type if (memory_type_index == (uint32_t)-1) { if (vkdev->info.type == 1) { // integrated gpu, prefer unified memory memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, 0); } else { // discrete gpu, device local memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); } } ptr->memory = allocate_memory(memoryRequirements.size); vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, 0); ptr->imageview = create_imageview(ptr->image, format); ptr->state = 1; return ptr; } void VkSimpleImageAllocator::fastFree(VkImageMemory* ptr) { vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); delete ptr; } #if __ANDROID_API__ >= 26 VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, const ImportAndroidHardwareBufferPipeline* p) : VkImageAllocator(_vkdev), q(p) { } VkAndroidHardwareBufferImageAllocator::~VkAndroidHardwareBufferImageAllocator() { } VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(AHardwareBuffer* hb) { VkResult ret; AHardwareBuffer_Desc bufferDesc; AHardwareBuffer_describe(hb, &bufferDesc); VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties; bufferFormatProperties.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID; bufferFormatProperties.pNext = 0; VkAndroidHardwareBufferPropertiesANDROID bufferProperties; bufferProperties.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID; bufferProperties.pNext = &bufferFormatProperties; ret = vkGetAndroidHardwareBufferPropertiesANDROID(vkdev->vkdevice(), hb, &bufferProperties); if (ret != VK_SUCCESS) { fprintf(stderr, "vkGetAndroidHardwareBufferPropertiesANDROID failed %d\n", ret); return 0; } VkExternalFormatANDROID externalFormat; externalFormat.sType = VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID; externalFormat.pNext = 0; externalFormat.externalFormat = bufferFormatProperties.externalFormat; VkExternalMemoryImageCreateInfo externalMemoryImageCreateInfo; externalMemoryImageCreateInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO, externalMemoryImageCreateInfo.pNext = &externalFormat, externalMemoryImageCreateInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID; VkImageCreateInfo imageCreateInfo; imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, imageCreateInfo.pNext = &externalMemoryImageCreateInfo; imageCreateInfo.flags = 0; imageCreateInfo.imageType = VK_IMAGE_TYPE_2D; imageCreateInfo.format = VK_FORMAT_UNDEFINED; imageCreateInfo.extent.width = bufferDesc.width; imageCreateInfo.extent.height = bufferDesc.height; imageCreateInfo.extent.depth = 1; imageCreateInfo.mipLevels = 1; imageCreateInfo.arrayLayers = 1; imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; imageCreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL; imageCreateInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT; imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; imageCreateInfo.queueFamilyIndexCount = 0; imageCreateInfo.pQueueFamilyIndices = 0; imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; VkImage image = 0; ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image); if (ret != VK_SUCCESS) { fprintf(stderr, "vkCreateImage failed %d\n", ret); return 0; } // setup memory type if (memory_type_index == (uint32_t)-1) { memory_type_index = vkdev->find_memory_index(bufferProperties.memoryTypeBits, 0, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); } VkImportAndroidHardwareBufferInfoANDROID importAndroidHardwareBufferInfo; importAndroidHardwareBufferInfo.sType = VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID; importAndroidHardwareBufferInfo.pNext = 0; importAndroidHardwareBufferInfo.buffer = hb; VkMemoryDedicatedAllocateInfo memoryDedicatedAllocateInfo; memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO; memoryDedicatedAllocateInfo.pNext = &importAndroidHardwareBufferInfo; memoryDedicatedAllocateInfo.image = image; memoryDedicatedAllocateInfo.buffer = VK_NULL_HANDLE; VkMemoryAllocateInfo memoryAllocateInfo; memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo; memoryAllocateInfo.allocationSize = bufferProperties.allocationSize; memoryAllocateInfo.memoryTypeIndex = memory_type_index; VkDeviceMemory memory = 0; ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory); if (ret != VK_SUCCESS) { fprintf(stderr, "vkAllocateMemory failed %d\n", ret); return 0; } VkBindImageMemoryInfo bindImageMemoryInfo; bindImageMemoryInfo.sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO; bindImageMemoryInfo.pNext = 0; bindImageMemoryInfo.image = image; bindImageMemoryInfo.memory = memory; bindImageMemoryInfo.memoryOffset = 0; ret = vkdev->vkBindImageMemory2KHR(vkdev->vkdevice(), 1, &bindImageMemoryInfo); if (ret != VK_SUCCESS) { fprintf(stderr, "vkBindImageMemory2KHR failed %d\n", ret); vkDestroyImage(vkdev->vkdevice(), image, 0); return 0; } VkSamplerYcbcrConversionInfoKHR samplerYcbcrConversionInfo; samplerYcbcrConversionInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO_KHR; samplerYcbcrConversionInfo.pNext = &externalFormat; samplerYcbcrConversionInfo.conversion = q->samplerYcbcrConversion; VkImageViewCreateInfo imageViewCreateInfo; imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; imageViewCreateInfo.pNext = &samplerYcbcrConversionInfo; imageViewCreateInfo.flags = 0; imageViewCreateInfo.image = image; imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; imageViewCreateInfo.format = VK_FORMAT_UNDEFINED; imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; imageViewCreateInfo.subresourceRange.baseMipLevel = 0; imageViewCreateInfo.subresourceRange.levelCount = 1; imageViewCreateInfo.subresourceRange.baseArrayLayer = 0; imageViewCreateInfo.subresourceRange.layerCount = 1; VkImageView imageview = 0; ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview); if (ret != VK_SUCCESS) { fprintf(stderr, "vkCreateImageView failed %d\n", ret); vkDestroyImage(vkdev->vkdevice(), image, 0); vkFreeMemory(vkdev->vkdevice(), memory, 0); return 0; } VkImageMemory* ptr = new VkImageMemory; ptr->image = image; ptr->memory = memory; ptr->imageview = imageview; ptr->state = 1; return ptr; } void VkAndroidHardwareBufferImageAllocator::fastFree(VkImageMemory* ptr) { vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); delete ptr; } #endif // __ANDROID_API__ >= 26 #endif // NCNN_VULKAN } // namespace ncnn