diff --git a/src/allocator.cpp b/src/allocator.cpp index 1a6252cb4..6282c4852 100644 --- a/src/allocator.cpp +++ b/src/allocator.cpp @@ -738,6 +738,16 @@ VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size) { // integrated gpu, prefer unified memory buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + + // on amd integrated gpu, there is a faster and larger device-only heap + uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties(); + uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex; + uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex; + if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size) + { + buffer_memory_type_index = device_local_memory_type_index; + } } else { @@ -990,6 +1000,16 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize, { // integrated gpu, prefer unified memory image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + + // on amd integrated gpu, there is a faster and larger device-only heap + uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties(); + uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex; + uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex; + if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size) + { + image_memory_type_index = device_local_memory_type_index; + } } else { @@ -1299,6 +1319,16 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) { // integrated gpu, prefer unified memory buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + + // on amd integrated gpu, there is a faster and larger device-only heap + uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties(); + uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex; + uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex; + if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size) + { + buffer_memory_type_index = device_local_memory_type_index; + } } else { @@ -1348,6 +1378,16 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) { // integrated gpu, prefer unified memory buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + + // on amd integrated gpu, there is a faster and larger device-only heap + uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties(); + uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex; + uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex; + if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size) + { + buffer_memory_type_index = device_local_memory_type_index; + } } else { @@ -1484,6 +1524,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz { // integrated gpu, prefer unified memory image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + + // on amd integrated gpu, there is a faster and larger device-only heap + uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties(); + uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex; + uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex; + if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size) + { + image_memory_type_index = device_local_memory_type_index; + } } else { @@ -1578,6 +1628,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz { // integrated gpu, prefer unified memory image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + + // on amd integrated gpu, there is a faster and larger device-only heap + uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties(); + uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex; + uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex; + if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size) + { + image_memory_type_index = device_local_memory_type_index; + } } else { diff --git a/src/gpu.cpp b/src/gpu.cpp index 88c44d53f..f32f6e20a 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -3153,23 +3153,13 @@ uint32_t VulkanDevice::get_heap_budget() const { const VkPhysicalDeviceMemoryProperties& memory_properties = info.physical_device_memory_properties(); - // the first device local heap - uint32_t device_local_heap_index = 0; - uint32_t device_local_heap_size = 0; - for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++) - { - const VkMemoryHeap& memoryHeap = memory_properties.memoryHeaps[i]; - if (memoryHeap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) - { - device_local_heap_index = i; - device_local_heap_size = memoryHeap.size / 1024 / 1024; - break; - } - } + uint32_t buffer_memory_type_index = d->dummy_allocator->buffer_memory_type_index; + uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex; if (!info.support_VK_EXT_memory_budget()) { // NCNN_LOGE("heap budget from assumption\n"); + uint32_t device_local_heap_size = memory_properties.memoryHeaps[buffer_heap_index].size / 1024 / 1024; // we usually cannot use all heap // 70% for 4G+ @@ -3187,7 +3177,7 @@ uint32_t VulkanDevice::get_heap_budget() const vkGetPhysicalDeviceMemoryProperties2KHR(info.physical_device(), &memoryProperties); - return memoryBudgetProperties.heapBudget[device_local_heap_index] / 1024 / 1024; + return memoryBudgetProperties.heapBudget[buffer_heap_index] / 1024 / 1024; } void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const