Browse Source

Merge remote-tracking branch 'origin/shader_cache2' into shader_cache2

pull/6221/head
ice 10 months ago
parent
commit
cab459a0d9
3 changed files with 60 additions and 62 deletions
  1. +57
    -59
      src/gpu.cpp
  2. +2
    -2
      src/pipelinecache.cpp
  3. +1
    -1
      src/pipelinecache.h

+ 57
- 59
src/gpu.cpp View File

@@ -477,16 +477,16 @@ void GpuInfoPrivate::query_properties()
}

if (physicalDeviceProperties.vendorID == 0x13b5
&& (physicalDeviceProperties.deviceID == 0x7500001
|| physicalDeviceProperties.deviceID == 0x7501000
|| physicalDeviceProperties.deviceID == 0x8602000
|| physicalDeviceProperties.deviceID == 0x8800020
|| physicalDeviceProperties.deviceID == 0x70930000
|| physicalDeviceProperties.deviceID == 0x70901010
|| physicalDeviceProperties.deviceID == 0x72120000
|| physicalDeviceProperties.deviceID == 0x74021000
|| physicalDeviceProperties.deviceID == 0x60a00002
|| physicalDeviceProperties.deviceID == 0x62210001))
&& (physicalDeviceProperties.deviceID == 0x7500001
|| physicalDeviceProperties.deviceID == 0x7501000
|| physicalDeviceProperties.deviceID == 0x8602000
|| physicalDeviceProperties.deviceID == 0x8800020
|| physicalDeviceProperties.deviceID == 0x70930000
|| physicalDeviceProperties.deviceID == 0x70901010
|| physicalDeviceProperties.deviceID == 0x72120000
|| physicalDeviceProperties.deviceID == 0x74021000
|| physicalDeviceProperties.deviceID == 0x60a00002
|| physicalDeviceProperties.deviceID == 0x62210001))
{
// NOTE rk3288/rk3399/t880/g31/g51/g52/g71/g72
// however, g76/g77 have explicit fp16 arithmetic
@@ -495,9 +495,9 @@ void GpuInfoPrivate::query_properties()
}

if (physicalDeviceProperties.vendorID == 0x5143
&& (physicalDeviceProperties.deviceID == 0x6030001
|| physicalDeviceProperties.deviceID == 0x6040001
|| physicalDeviceProperties.deviceID == 0x6050002))
&& (physicalDeviceProperties.deviceID == 0x6030001
|| physicalDeviceProperties.deviceID == 0x6040001
|| physicalDeviceProperties.deviceID == 0x6050002))
{
// TODO enable devices other than qcom845/qcom855/qcom855plus/qcom865
// qcom adreno driver accepts spirv with fp16 arithmetic
@@ -513,7 +513,7 @@ static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyPropert
const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
&& !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
&& !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
{
return i;
}
@@ -525,7 +525,7 @@ static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyPropert
const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
&& (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
&& (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
{
return i;
}
@@ -554,8 +554,8 @@ static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProper
const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

if ((queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
&& !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
&& !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
&& !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
&& !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
{
return i;
}
@@ -1138,30 +1138,30 @@ void GpuInfoPrivate::query_extension_properties()
// NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope);

if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
{
support_cooperative_matrix_8_8_16 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
{
support_cooperative_matrix_16_8_8 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
{
support_cooperative_matrix_16_8_16 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
{
support_cooperative_matrix_16_16_16 = true;
}
@@ -1195,30 +1195,30 @@ void GpuInfoPrivate::query_extension_properties()
// NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope);

if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
{
support_cooperative_matrix_8_8_16 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
{
support_cooperative_matrix_16_8_8 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
{
support_cooperative_matrix_16_8_16 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
{
support_cooperative_matrix_16_16_16 = true;
}
@@ -2032,8 +2032,8 @@ void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponen
const VkCooperativeMatrixPropertiesKHR& cmp = d->queryCooperativeMatrixSubProperties[i];

if (cmp.AType == type && cmp.BType == type
&& cmp.CType == acctype && cmp.ResultType == acctype
&& cmp.scope == scope)
&& cmp.CType == acctype && cmp.ResultType == acctype
&& cmp.scope == scope)
{
mnk_properties.push_back(cmp);
}
@@ -2046,8 +2046,8 @@ void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponen
const VkCooperativeMatrixPropertiesNV& cmp = d->queryCooperativeMatrixSubPropertiesNV[i];

if (cmp.AType == (VkComponentTypeNV)type && cmp.BType == (VkComponentTypeNV)type
&& cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype
&& cmp.scope == (VkScopeNV)scope)
&& cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype
&& cmp.scope == (VkScopeNV)scope)
{
VkCooperativeMatrixPropertiesKHR cmp_khr;
cmp_khr.MSize = cmp.MSize;
@@ -2460,7 +2460,7 @@ int create_gpu_instance(const char* driver_path)
#endif // __ANDROID_API__ >= 26

uint32_t instance_api_version = VK_MAKE_VERSION(1, 0, 0);
typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t* pApiVersion);
typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion);
PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion");
if (vkEnumerateInstanceVersion)
{
@@ -2673,7 +2673,7 @@ int create_gpu_instance(const char* driver_path)
fp16_matrix_properties.push_back(cmp);
}
if ((cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.AType == VK_COMPONENT_TYPE_SINT8_PACKED_NV)
&& (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV))
&& (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV))
{
bool mnk_hit = false;
for (size_t k = 0; k < int8_matrix_properties.size(); k++)
@@ -2704,9 +2704,9 @@ int create_gpu_instance(const char* driver_path)
bf16_matrix_properties.push_back(cmp);
}
if ((cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
|| cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)
&& (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
|| cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV))
|| cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)
&& (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
|| cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV))
{
bool mnk_hit = false;
for (size_t k = 0; k < fp8_matrix_properties.size(); k++)
@@ -3144,9 +3144,8 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_
uop->vkdev = vkdev;

ncnn::ParamDict pd;
pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4
: 8); // out_elempack
pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int8
pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack
pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int8
pd.set(3, cast_type_to_index + 1);

uop->load_param(pd);
@@ -3897,8 +3896,8 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ
{
const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
if ((memoryType.propertyFlags & required) == required
&& (preferred && (memoryType.propertyFlags & preferred))
&& (preferred_not && !(memoryType.propertyFlags & preferred_not)))
&& (preferred && (memoryType.propertyFlags & preferred))
&& (preferred_not && !(memoryType.propertyFlags & preferred_not)))
{
return i;
}
@@ -3913,7 +3912,7 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ
{
const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
if ((memoryType.propertyFlags & required) == required
&& (preferred && (memoryType.propertyFlags & preferred)))
&& (preferred && (memoryType.propertyFlags & preferred)))
{
return i;
}
@@ -3928,7 +3927,7 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ
{
const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
if ((memoryType.propertyFlags & required) == required
&& (preferred_not && !(memoryType.propertyFlags & preferred_not)))
&& (preferred_not && !(memoryType.propertyFlags & preferred_not)))
{
return i;
}
@@ -4236,8 +4235,7 @@ void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempac

void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const
{
int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1
: 2;
int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;

int cast_type_from_index;
if (src.elembits() == 32)


+ 2
- 2
src/pipelinecache.cpp View File

@@ -145,7 +145,7 @@ public:
};

PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
{
spv_data_murmur3 = murmur3_32(spv_data, spv_data_size / 4);

@@ -163,7 +163,7 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_
}

PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
{
shader_type_index = _shader_type_index;



+ 1
- 1
src/pipelinecache.h View File

@@ -42,7 +42,7 @@ public:
VkDescriptorUpdateTemplateKHR* descriptor_update_template,
ShaderInfo& shader_info) const;

int save_cache(std::vector<unsigned char> &buf) const;
int save_cache(std::vector<unsigned char>& buf) const;
int load_cache(const std::vector<unsigned char>& buf) const;

#ifdef NCNN_STDIO


Loading…
Cancel
Save