Browse Source

Prefer the faster and larger device-local-only memory heap on AMD integrated graphics; the heap budget value now follows the same memory-type selection strategy as the blob allocator (#4936)

tags/20230816
nihui GitHub 2 years ago
parent
commit
e80fcbca8f
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 64 additions and 14 deletions
  1. +60
    -0
      src/allocator.cpp
  2. +4
    -14
      src/gpu.cpp

+ 60
- 0
src/allocator.cpp View File

@@ -738,6 +738,16 @@ VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
{
// integrated gpu, prefer unified memory
buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

// on amd integrated gpu, there is a faster and larger device-only heap
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
{
buffer_memory_type_index = device_local_memory_type_index;
}
}
else
{
@@ -990,6 +1000,16 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize,
{
// integrated gpu, prefer unified memory
image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

// on amd integrated gpu, there is a faster and larger device-only heap
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
{
image_memory_type_index = device_local_memory_type_index;
}
}
else
{
@@ -1299,6 +1319,16 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
{
// integrated gpu, prefer unified memory
buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

// on amd integrated gpu, there is a faster and larger device-only heap
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
{
buffer_memory_type_index = device_local_memory_type_index;
}
}
else
{
@@ -1348,6 +1378,16 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
{
// integrated gpu, prefer unified memory
buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

// on amd integrated gpu, there is a faster and larger device-only heap
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
{
buffer_memory_type_index = device_local_memory_type_index;
}
}
else
{
@@ -1484,6 +1524,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz
{
// integrated gpu, prefer unified memory
image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

// on amd integrated gpu, there is a faster and larger device-only heap
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
{
image_memory_type_index = device_local_memory_type_index;
}
}
else
{
@@ -1578,6 +1628,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz
{
// integrated gpu, prefer unified memory
image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

// on amd integrated gpu, there is a faster and larger device-only heap
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
{
image_memory_type_index = device_local_memory_type_index;
}
}
else
{


+ 4
- 14
src/gpu.cpp View File

@@ -3153,23 +3153,13 @@ uint32_t VulkanDevice::get_heap_budget() const
{
const VkPhysicalDeviceMemoryProperties& memory_properties = info.physical_device_memory_properties();

// the first device local heap
uint32_t device_local_heap_index = 0;
uint32_t device_local_heap_size = 0;
for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
{
const VkMemoryHeap& memoryHeap = memory_properties.memoryHeaps[i];
if (memoryHeap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)
{
device_local_heap_index = i;
device_local_heap_size = memoryHeap.size / 1024 / 1024;
break;
}
}
uint32_t buffer_memory_type_index = d->dummy_allocator->buffer_memory_type_index;
uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;

if (!info.support_VK_EXT_memory_budget())
{
// NCNN_LOGE("heap budget from assumption\n");
uint32_t device_local_heap_size = memory_properties.memoryHeaps[buffer_heap_index].size / 1024 / 1024;

// we usually cannot use all heap
// 70% for 4G+
@@ -3187,7 +3177,7 @@ uint32_t VulkanDevice::get_heap_budget() const

vkGetPhysicalDeviceMemoryProperties2KHR(info.physical_device(), &memoryProperties);

return memoryBudgetProperties.heapBudget[device_local_heap_index] / 1024 / 1024;
return memoryBudgetProperties.heapBudget[buffer_heap_index] / 1024 / 1024;
}

void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const


Loading…
Cancel
Save