Browse Source

workaround local workgroup size specialization constant bug for old arm mali vulkan driver, fix #1424

tags/20200106
nihui 6 years ago
parent
commit
8a87f0267a
3 changed files with 174 additions and 6 deletions
  1. +149
    -5
      src/gpu.cpp
  2. +9
    -0
      src/gpu.h
  3. +16
    -1
      src/pipeline.cpp

+ 149
- 5
src/gpu.cpp View File

@@ -520,11 +520,12 @@ int create_gpu_instance()
// fprintf(stderr, "[%u] deviceName = %s\n", i, physicalDeviceProperties.deviceName);
// fprintf(stderr, "[%u] pipelineCacheUUID = %u\n", i, physicalDeviceProperties.pipelineCacheUUID);

gpu_info.bug_local_size_spec_const = false;

if (physicalDeviceProperties.vendorID == 0x13b5 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 66))
{
// ignore arm mali with old buggy driver
fprintf(stderr, "arm mali driver is too old\n");
continue;
// arm mali with old buggy driver
gpu_info.bug_local_size_spec_const = true;
}

if (physicalDeviceProperties.vendorID == 0x5143 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 49))
@@ -762,9 +763,10 @@ int create_gpu_instance()
gpu_info.support_fp16_storage = false;
}

fprintf(stderr, "[%u %s] queueC=%u[%u] queueT=%u[%u]\n", i, physicalDeviceProperties.deviceName,
fprintf(stderr, "[%u %s] queueC=%u[%u] queueT=%u[%u] buglssc=%d\n", i, physicalDeviceProperties.deviceName,
gpu_info.compute_queue_family_index, gpu_info.compute_queue_count,
gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count);
gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count,
gpu_info.bug_local_size_spec_const);

fprintf(stderr, "[%u %s] fp16p=%d fp16s=%d fp16a=%d int8s=%d int8a=%d\n", i, physicalDeviceProperties.deviceName,
gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
@@ -1049,6 +1051,32 @@ VkShaderModule VulkanDevice::get_shader_module(const char* name) const
return 0;
}

VkShaderModule VulkanDevice::create_shader_module(const char* name, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const
{
const uint32_t* spv_data = 0;
size_t spv_data_size = 0;

for (int i=0; i<layer_shader_registry_entry_count; i++)
{
const char* shader_name = layer_shader_registry[i].name;

if (strcmp(shader_name, name) == 0)
{
spv_data = layer_shader_registry[i].spv_data;
spv_data_size = layer_shader_registry[i].spv_data_size;
break;
}
}

if (!spv_data)
{
fprintf(stderr, "no such shader module %s\n", name);
return 0;
}

return compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
}

VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const
{
VkShaderModuleCreateInfo shaderModuleCreateInfo;
@@ -1069,6 +1097,116 @@ VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, siz
return shader_module;
}

static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t* dstcode, size_t* dstsize)
{
uint32_t local_size_x_id = -1;
uint32_t local_size_y_id = -1;
uint32_t local_size_z_id = -1;
uint32_t gl_WorkGroupSize_id = -1;

const uint32_t* p = code;
uint32_t* dp = dstcode;

// skip magic version generator bound schema
memcpy(dp, p, 5 * sizeof(uint32_t));
p += 5;
dp += 5;

// foreach op
while ((const unsigned char*)p < (const unsigned char*)code + size)
{
uint32_t opcode = p[0];

uint16_t wordcount = opcode >> 16;
uint16_t op = opcode & 0xffff;

if (op == 16) // OpExecutionMode
{
uint32_t mode = p[2];
if (mode == 17) // LocalSize
{
memcpy(dp, p, wordcount * sizeof(uint32_t));

// set local_size_xyz
dp[3] = local_size_x;
dp[4] = local_size_y;
dp[5] = local_size_z;

p += wordcount;
dp += wordcount;
continue;
}
}
else if (op == 50) // OpSpecConstant
{
uint32_t id = p[2];
if (id == local_size_x_id || id == local_size_y_id || id == local_size_z_id)
{
p += wordcount;
continue;
}
}
else if (op == 51) // OpSpecConstantComposite
{
uint32_t id = p[2];
if (id == gl_WorkGroupSize_id)
{
if (wordcount == 6 && (p[3] == local_size_x_id || p[4] == local_size_y_id || p[5] == local_size_z_id))
{
p += wordcount;
continue;
}
}
}
else if (op == 71) // OpDecorate
{
uint32_t id = p[1];
uint32_t decoration = p[2];
if (decoration == 1) // SpecId
{
uint32_t specid = p[3];
if (specid == 233) local_size_x_id = id;
if (specid == 234) local_size_y_id = id;
if (specid == 235) local_size_z_id = id;
if (specid == 233 || specid == 234 || specid == 235)
{
p += wordcount;
continue;
}
}
else if (decoration == 11) // BuiltIn
{
uint32_t builtin = p[3];
if (builtin == 25) // WorkgroupSize
{
gl_WorkGroupSize_id = id;
p += wordcount;
continue;
}
}
}

memcpy(dp, p, wordcount * sizeof(uint32_t));
p += wordcount;
dp += wordcount;
}

*dstsize = (unsigned char*)dp - (unsigned char*)dstcode;
}

VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const
{
uint32_t* spv_data_modified = (uint32_t*)malloc(spv_data_size);
size_t spv_data_size_modified = spv_data_size;
inject_local_size_xyz(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, spv_data_modified, &spv_data_size_modified);

VkShaderModule shader_module = compile_shader_module(spv_data_modified, spv_data_size_modified);

free(spv_data_modified);

return shader_module;
}

uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const
{
// first try, find required and with preferred and without preferred_not
@@ -1293,6 +1431,12 @@ static inline bool string_ends_with_fp16a(const char* name)

int VulkanDevice::create_shader_module()
{
if (info.bug_local_size_spec_const)
{
// do not cache shader module
return 0;
}

shader_modules.resize(layer_shader_registry_entry_count, VK_NULL_HANDLE);

for (int i=0; i<layer_shader_registry_entry_count; i++)


+ 9
- 0
src/gpu.h View File

@@ -110,6 +110,9 @@ public:
uint32_t graphics_queue_count;
uint32_t transfer_queue_count;

// bug is not feature
bool bug_local_size_spec_const;

// fp16 and int8 feature
bool support_fp16_packed;
bool support_fp16_storage;
@@ -156,8 +159,14 @@ public:

VkShaderModule get_shader_module(const char* name) const;

// with fixed workgroup size
VkShaderModule create_shader_module(const char* name, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;

VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;

// with fixed workgroup size
VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;

uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
bool is_mappable(uint32_t memory_type_index) const;



+ 16
- 1
src/pipeline.cpp View File

@@ -48,7 +48,14 @@ Pipeline::~Pipeline()

int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const char* entry_name, const std::vector<vk_specialization_type>& specializations, int binding_count, int push_constant_count)
{
local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size);
if (vkdev->info.bug_local_size_spec_const)
{
local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z);
}
else
{
local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size);
}

// fprintf(stderr, "local_shader_module %p %s created\n", local_shader_module, entry_name);

@@ -88,6 +95,13 @@ int Pipeline::create(const char* _name, const Option& opt, const std::vector<vk_
name += "_fp16p";
}

if (vkdev->info.bug_local_size_spec_const)
{
local_shader_module = vkdev->create_shader_module(name.c_str(), local_size_x, local_size_y, local_size_z);

return create(local_shader_module, name.c_str(), specializations, binding_count, push_constant_count);
}

VkShaderModule shader_module = vkdev->get_shader_module(name.c_str());

return create(shader_module, name.c_str(), specializations, binding_count, push_constant_count);
@@ -324,6 +338,7 @@ int Pipeline::create_pipeline(VkShaderModule shader_module, const char* entry_na
std::vector<vk_specialization_type> specialization_data = specializations;

// append local_size_xyz specialization
if (!vkdev->info.bug_local_size_spec_const)
{
VkSpecializationMapEntry* local_size_xyz_entries = specializationMapEntries.data() + specialization_count;



Loading…
Cancel
Save