Browse Source

feat: pipe & spv cache

pull/6221/head
ice 9 months ago
parent
commit
7a0c19c856
6 changed files with 807 additions and 64 deletions
  1. +74
    -59
      src/gpu.cpp
  2. +2
    -1
      src/gpu.h
  3. +315
    -4
      src/pipelinecache.cpp
  4. +10
    -0
      src/pipelinecache.h
  5. +1
    -0
      tests/CMakeLists.txt
  6. +405
    -0
      tests/test_pipecache.cpp

+ 74
- 59
src/gpu.cpp View File

@@ -98,6 +98,7 @@ struct layer_shader_registry_entry

static const layer_shader_registry_entry layer_shader_registry[] = {
#include "layer_shader_registry.h"

};

static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry);
@@ -476,16 +477,16 @@ void GpuInfoPrivate::query_properties()
}

if (physicalDeviceProperties.vendorID == 0x13b5
&& (physicalDeviceProperties.deviceID == 0x7500001
|| physicalDeviceProperties.deviceID == 0x7501000
|| physicalDeviceProperties.deviceID == 0x8602000
|| physicalDeviceProperties.deviceID == 0x8800020
|| physicalDeviceProperties.deviceID == 0x70930000
|| physicalDeviceProperties.deviceID == 0x70901010
|| physicalDeviceProperties.deviceID == 0x72120000
|| physicalDeviceProperties.deviceID == 0x74021000
|| physicalDeviceProperties.deviceID == 0x60a00002
|| physicalDeviceProperties.deviceID == 0x62210001))
&& (physicalDeviceProperties.deviceID == 0x7500001
|| physicalDeviceProperties.deviceID == 0x7501000
|| physicalDeviceProperties.deviceID == 0x8602000
|| physicalDeviceProperties.deviceID == 0x8800020
|| physicalDeviceProperties.deviceID == 0x70930000
|| physicalDeviceProperties.deviceID == 0x70901010
|| physicalDeviceProperties.deviceID == 0x72120000
|| physicalDeviceProperties.deviceID == 0x74021000
|| physicalDeviceProperties.deviceID == 0x60a00002
|| physicalDeviceProperties.deviceID == 0x62210001))
{
// NOTE rk3288/rk3399/t880/g31/g51/g52/g71/g72
// however, g76/g77 has explicit fp16 arithmetic
@@ -494,9 +495,9 @@ void GpuInfoPrivate::query_properties()
}

if (physicalDeviceProperties.vendorID == 0x5143
&& (physicalDeviceProperties.deviceID == 0x6030001
|| physicalDeviceProperties.deviceID == 0x6040001
|| physicalDeviceProperties.deviceID == 0x6050002))
&& (physicalDeviceProperties.deviceID == 0x6030001
|| physicalDeviceProperties.deviceID == 0x6040001
|| physicalDeviceProperties.deviceID == 0x6050002))
{
// TODO enable devices other than qcom845/qcom855/qcom855plus/qcom865
// qcom adreno driver accept spirv with fp16 arithmetic
@@ -512,7 +513,7 @@ static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyPropert
const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
&& !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
&& !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
{
return i;
}
@@ -524,7 +525,7 @@ static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyPropert
const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
&& (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
&& (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
{
return i;
}
@@ -553,8 +554,8 @@ static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProper
const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

if ((queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
&& !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
&& !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
&& !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
&& !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
{
return i;
}
@@ -1137,30 +1138,30 @@ void GpuInfoPrivate::query_extension_properties()
// NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope);

if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
{
support_cooperative_matrix_8_8_16 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
{
support_cooperative_matrix_16_8_8 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
{
support_cooperative_matrix_16_8_16 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
&& cmp.scope == VK_SCOPE_SUBGROUP_KHR)
{
support_cooperative_matrix_16_16_16 = true;
}
@@ -1194,30 +1195,30 @@ void GpuInfoPrivate::query_extension_properties()
// NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope);

if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
{
support_cooperative_matrix_8_8_16 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
{
support_cooperative_matrix_16_8_8 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
{
support_cooperative_matrix_16_8_16 = true;
}
if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
&& cmp.scope == VK_SCOPE_SUBGROUP_NV)
{
support_cooperative_matrix_16_16_16 = true;
}
@@ -2031,8 +2032,8 @@ void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponen
const VkCooperativeMatrixPropertiesKHR& cmp = d->queryCooperativeMatrixSubProperties[i];

if (cmp.AType == type && cmp.BType == type
&& cmp.CType == acctype && cmp.ResultType == acctype
&& cmp.scope == scope)
&& cmp.CType == acctype && cmp.ResultType == acctype
&& cmp.scope == scope)
{
mnk_properties.push_back(cmp);
}
@@ -2045,8 +2046,8 @@ void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponen
const VkCooperativeMatrixPropertiesNV& cmp = d->queryCooperativeMatrixSubPropertiesNV[i];

if (cmp.AType == (VkComponentTypeNV)type && cmp.BType == (VkComponentTypeNV)type
&& cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype
&& cmp.scope == (VkScopeNV)scope)
&& cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype
&& cmp.scope == (VkScopeNV)scope)
{
VkCooperativeMatrixPropertiesKHR cmp_khr;
cmp_khr.MSize = cmp.MSize;
@@ -2459,7 +2460,7 @@ int create_gpu_instance(const char* driver_path)
#endif // __ANDROID_API__ >= 26

uint32_t instance_api_version = VK_MAKE_VERSION(1, 0, 0);
typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion);
typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t* pApiVersion);
PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion");
if (vkEnumerateInstanceVersion)
{
@@ -2672,7 +2673,7 @@ int create_gpu_instance(const char* driver_path)
fp16_matrix_properties.push_back(cmp);
}
if ((cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.AType == VK_COMPONENT_TYPE_SINT8_PACKED_NV)
&& (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV))
&& (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV))
{
bool mnk_hit = false;
for (size_t k = 0; k < int8_matrix_properties.size(); k++)
@@ -2703,9 +2704,9 @@ int create_gpu_instance(const char* driver_path)
bf16_matrix_properties.push_back(cmp);
}
if ((cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
|| cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)
&& (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
|| cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV))
|| cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)
&& (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
|| cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV))
{
bool mnk_hit = false;
for (size_t k = 0; k < fp8_matrix_properties.size(); k++)
@@ -3143,8 +3144,9 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_
uop->vkdev = vkdev;

ncnn::ParamDict pd;
pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack
pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int8
pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4
: 8); // out_elempack
pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int8
pd.set(3, cast_type_to_index + 1);

uop->load_param(pd);
@@ -3734,7 +3736,7 @@ int VulkanDevice::create_pipeline_layout(int push_constant_count, VkDescriptorSe
return 0;
}

int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const
int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline, VkPipelineCache pipeline_cache) const
{
const int specialization_count = specializations.size();

@@ -3792,7 +3794,7 @@ int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout
computePipelineCreateInfo.basePipelineHandle = 0;
computePipelineCreateInfo.basePipelineIndex = 0;

VkResult ret = vkCreateComputePipelines(d->device, 0, 1, &computePipelineCreateInfo, 0, pipeline);
VkResult ret = vkCreateComputePipelines(d->device, pipeline_cache, 1, &computePipelineCreateInfo, 0, pipeline);
if (ret != VK_SUCCESS)
{
NCNN_LOGE("vkCreateComputePipelines failed %d", ret);
@@ -3871,6 +3873,18 @@ int VulkanDevice::create_descriptor_update_template(int binding_count, const int
return 0;
}

int VulkanDevice::create_pipeline_cache(const VkPipelineCacheCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineCache* pPipelineCache) const
{
VkResult ret = vkCreatePipelineCache(d->device, pCreateInfo, pAllocator, pPipelineCache);
if (ret != VK_SUCCESS)
{
NCNN_LOGE("vkCreatePipelineCache failed %d", ret);
return -1;
}

return 0;
}

uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const
{
const VkPhysicalDeviceMemoryProperties& memory_properties = info.physicalDeviceMemoryProperties();
@@ -3883,8 +3897,8 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ
{
const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
if ((memoryType.propertyFlags & required) == required
&& (preferred && (memoryType.propertyFlags & preferred))
&& (preferred_not && !(memoryType.propertyFlags & preferred_not)))
&& (preferred && (memoryType.propertyFlags & preferred))
&& (preferred_not && !(memoryType.propertyFlags & preferred_not)))
{
return i;
}
@@ -3899,7 +3913,7 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ
{
const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
if ((memoryType.propertyFlags & required) == required
&& (preferred && (memoryType.propertyFlags & preferred)))
&& (preferred && (memoryType.propertyFlags & preferred)))
{
return i;
}
@@ -3914,7 +3928,7 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ
{
const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
if ((memoryType.propertyFlags & required) == required
&& (preferred_not && !(memoryType.propertyFlags & preferred_not)))
&& (preferred_not && !(memoryType.propertyFlags & preferred_not)))
{
return i;
}
@@ -4222,7 +4236,8 @@ void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempac

void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const
{
int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;
int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1
: 2;

int cast_type_from_index;
if (src.elembits() == 32)


+ 2
- 1
src/gpu.h View File

@@ -419,8 +419,9 @@ public:
// helper for creating pipeline
int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const;
int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline, VkPipelineCache pipeline_cache = 0) const;
int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
int create_pipeline_cache(const VkPipelineCacheCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineCache* pPipelineCache) const;

uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
bool is_mappable(uint32_t memory_type_index) const;


+ 315
- 4
src/pipelinecache.cpp View File

@@ -110,13 +110,42 @@ public:
ShaderInfo shader_info; // TODO use pointer ?
};

// Key of one cached SPIR-V module: the shader type index together with the
// option bits the module was compiled with.
// The two 32-bit fields share storage with d0, so a key can be compared with a
// single 64-bit integer comparison, and the struct is copied byte-for-byte into
// the serialized cache buffer.
struct spv_param
{
union
{
struct
{
// index of the shader type the module was compiled from
int32_t shader_type_index;
// packed option flags used for the compilation
uint32_t opt_bits;
};
// both fields above viewed as one 64-bit value
uint64_t d0;
};
};

// Fixed-size header placed at the very beginning of a serialized cache buffer.
// It is written and read back with a raw memcpy, so the order and width of the
// fields define the stored format.
struct pipeline_cache_header
{
uint32_t magic = 0x5a545546; // format marker, checked first when loading
uint32_t vendorID; // VkPhysicalDeviceProperties::vendorID
uint32_t deviceID; // VkPhysicalDeviceProperties::deviceID
uint32_t driverVersion; // VkPhysicalDeviceProperties::driverVersion
uint8_t uuid[VK_UUID_SIZE]; // VkPhysicalDeviceProperties::pipelineCacheUUID

uint32_t spv_size; // number of cached SPIR-V module entries that follow the header (an entry count, not a byte size)
uint32_t pipeline_cache_size; // byte size of the Vulkan pipeline cache data stored after the SPIR-V entries
};

mutable std::vector<pipeline_cache_digest> cache_digests;
mutable std::vector<pipeline_cache_artifact> cache_artifacts;

VkPipelineCache vk_pipeline_cache;
mutable std::vector<std::pair<spv_param, std::vector<uint32_t> > > cache_spirv_module; // digest(index,opt) -> spirv data

mutable Mutex cache_lock;
};

PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
{
spv_data_murmur3 = murmur3_32(spv_data, spv_data_size / 4);

@@ -134,7 +163,7 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_
}

PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
{
shader_type_index = _shader_type_index;

@@ -160,6 +189,18 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_t
// Construct a pipeline cache bound to _vkdev and try to create an empty
// Vulkan pipeline cache object for it.
//
// When no device is given, or the Vulkan object cannot be created, the handle
// stays 0 and pipelines are simply created without a driver-level cache.
PipelineCache::PipelineCache(const VulkanDevice* _vkdev)
    : vkdev(_vkdev), d(new PipelineCachePrivate)
{
    // Never leave the handle indeterminate, whatever happens below.
    d->vk_pipeline_cache = 0;

    // save_cache()/load_cache() tolerate a null device, so the constructor must too.
    if (!vkdev)
    {
        NCNN_LOGE("vkdev is null");
        return;
    }

    VkPipelineCacheCreateInfo pipelineCacheCreateInfo{};
    pipelineCacheCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
    pipelineCacheCreateInfo.initialDataSize = 0; // zeros for empty cache
    pipelineCacheCreateInfo.pInitialData = nullptr;

    int ret = vkdev->create_pipeline_cache(&pipelineCacheCreateInfo, 0, &d->vk_pipeline_cache);
    if (ret != 0)
    {
        NCNN_LOGE("create_pipeline_cache failed %d", ret);
        d->vk_pipeline_cache = 0;
    }
}

PipelineCache::~PipelineCache()
@@ -381,18 +422,288 @@ int PipelineCache::get_pipeline(int shader_type_index, const Option& opt, const
return 0;
}

// Serialize the cached SPIR-V modules and the Vulkan pipeline cache into buf.
//
// Layout written to buf:
//   pipeline_cache_header
//   header.spv_size entries of { spv_param key, uint32_t word count, SPIR-V words }
//   header.pipeline_cache_size bytes returned by vkGetPipelineCacheData
//
// Previous content of buf is discarded. Returns 0 on success and -1 on
// failure; buf is only modified once every fallible step has succeeded.
int PipelineCache::save_cache(std::vector<unsigned char>& buf) const
{
    if (!vkdev)
    {
        NCNN_LOGE("vkdev is null");
        return -1;
    }
    MutexLockGuard lock(d->cache_lock);

    PipelineCachePrivate::pipeline_cache_header header;

    // Platform information
    header.vendorID = vkdev->info.vendor_id();
    header.deviceID = vkdev->info.device_id();
    header.driverVersion = vkdev->info.driver_version();
    memcpy(header.uuid, vkdev->info.pipeline_cache_uuid(), VK_UUID_SIZE);

    header.spv_size = (uint32_t)d->cache_spirv_module.size();

    // The handle is 0 when cache creation failed earlier; in that case only
    // the SPIR-V modules are stored and the pipeline cache part stays empty.
    std::vector<unsigned char> pipe_data;
    if (d->vk_pipeline_cache)
    {
        size_t pipe_size = 0;
        if (vkGetPipelineCacheData(vkdev->vkdevice(), d->vk_pipeline_cache, &pipe_size, nullptr) != VK_SUCCESS)
        {
            NCNN_LOGE("vkGetPipelineCacheData failed");
            return -1;
        }

        // The header stores the size in 32 bits; refuse instead of truncating.
        if (pipe_size > 0xffffffffu)
        {
            NCNN_LOGE("vkGetPipelineCacheData returned too much data");
            return -1;
        }

        if (pipe_size > 0)
        {
            pipe_data.resize(pipe_size);
            if (vkGetPipelineCacheData(vkdev->vkdevice(), d->vk_pipeline_cache, &pipe_size, pipe_data.data()) != VK_SUCCESS)
            {
                NCNN_LOGE("vkGetPipelineCacheData failed");
                return -1;
            }

            // pipe_size now holds the amount actually written, which is what must be recorded.
            pipe_data.resize(pipe_size);
        }
    }
    header.pipeline_cache_size = (uint32_t)pipe_data.size();

    // Compute the final size up front so the output grows only once.
    size_t total_size = sizeof(header) + pipe_data.size();
    for (size_t i = 0; i < d->cache_spirv_module.size(); i++)
    {
        total_size += sizeof(PipelineCachePrivate::spv_param) + sizeof(uint32_t);
        total_size += d->cache_spirv_module[i].second.size() * sizeof(uint32_t);
    }

    buf.clear();
    buf.reserve(total_size);

    const unsigned char* header_bytes = (const unsigned char*)&header;
    buf.insert(buf.end(), header_bytes, header_bytes + sizeof(header));

    // spv_digest and spv_data
    for (size_t i = 0; i < d->cache_spirv_module.size(); i++)
    {
        const PipelineCachePrivate::spv_param& sd = d->cache_spirv_module[i].first;
        const std::vector<uint32_t>& spv_data = d->cache_spirv_module[i].second;
        const uint32_t size = (uint32_t)spv_data.size(); // in uint32_t units

        const unsigned char* key_bytes = (const unsigned char*)&sd;
        buf.insert(buf.end(), key_bytes, key_bytes + sizeof(sd));

        const unsigned char* size_bytes = (const unsigned char*)&size;
        buf.insert(buf.end(), size_bytes, size_bytes + sizeof(size));

        const unsigned char* word_bytes = (const unsigned char*)spv_data.data();
        buf.insert(buf.end(), word_bytes, word_bytes + spv_data.size() * sizeof(uint32_t));
    }

    buf.insert(buf.end(), pipe_data.begin(), pipe_data.end());
    return 0;
}

int PipelineCache::load_cache(const std::vector<unsigned char>& buf) const
{
if (!vkdev)
{
NCNN_LOGE("vkdev is null");
return -1;
}
MutexLockGuard lock(d->cache_lock);

// Corrected struct name to pipeline_cache_header (lowercase h)
if (buf.size() < sizeof(PipelineCachePrivate::pipeline_cache_header))
{
NCNN_LOGE("Invalid cache buffer size: too small for header");
return -1;
}

PipelineCachePrivate::pipeline_cache_header header;
memcpy(&header, buf.data(), sizeof(header));

// Validate magic number
if (header.magic != 0x5a545546)
{
NCNN_LOGE("Invalid cache magic number");
return -1;
}

// Validate platform information for compatibility
if (header.vendorID != vkdev->info.vendor_id() || header.deviceID != vkdev->info.device_id() || header.driverVersion != vkdev->info.driver_version() || memcmp(header.uuid, vkdev->info.pipeline_cache_uuid(), VK_UUID_SIZE) != 0)
{
NCNN_LOGE("Cache platform mismatch, might be incompatible.");
return -1;
}

size_t current_offset = sizeof(header);

// Load SPIR-V data and associated spv_param
d->cache_spirv_module.reserve(header.spv_size);

for (uint32_t i = 0; i < header.spv_size; ++i)
{
if (current_offset + sizeof(PipelineCachePrivate::spv_param) + sizeof(uint32_t) > buf.size())
{
NCNN_LOGE("Invalid cache buffer size: incomplete spv_param or size for entry %u", i);
return -1;
}

PipelineCachePrivate::spv_param sd;
memcpy(&sd, buf.data() + current_offset, sizeof(sd));
current_offset += sizeof(sd);

uint32_t spv_vec_size_uint32; // Size in uint32_t units
memcpy(&spv_vec_size_uint32, buf.data() + current_offset, sizeof(spv_vec_size_uint32));
current_offset += sizeof(spv_vec_size_uint32);

size_t spv_data_byte_size = spv_vec_size_uint32 * sizeof(uint32_t);

if (current_offset + spv_data_byte_size > buf.size())
{
NCNN_LOGE("Invalid cache buffer size: incomplete spv_data for entry %u", i);
return -1;
}

std::vector<uint32_t> spirv_data(spv_vec_size_uint32);
memcpy(spirv_data.data(), buf.data() + current_offset, spv_data_byte_size);
current_offset += spv_data_byte_size;

d->cache_spirv_module.push_back({sd, spirv_data});
}

// Load Vulkan Pipeline Cache Data
if (current_offset + header.pipeline_cache_size > buf.size())
{
NCNN_LOGE("Invalid cache buffer size: incomplete pipeline cache data");
return -1;
}

if (d->vk_pipeline_cache)
{
vkDestroyPipelineCache(vkdev->vkdevice(), d->vk_pipeline_cache, 0);
d->vk_pipeline_cache = 0;
}

VkPipelineCacheCreateInfo pipelineCacheCreateInfo{};
pipelineCacheCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
pipelineCacheCreateInfo.initialDataSize = header.pipeline_cache_size;
pipelineCacheCreateInfo.pInitialData = buf.data() + current_offset;

int ret = vkdev->create_pipeline_cache(&pipelineCacheCreateInfo, 0, &d->vk_pipeline_cache);
if (ret != 0)
{
NCNN_LOGE("create_pipeline_cache with initial data failed %d", ret);
d->vk_pipeline_cache = 0;
return -1;
}

return 0;
}

// Serialize the cache and write it to an already opened stream.
//
// Returns 0 when every byte was written. A null stream or a short write
// yields -1; a serialization failure is passed through unchanged.
int PipelineCache::save_cache(FILE* fp) const
{
    if (fp == nullptr)
    {
        NCNN_LOGE("Invalid FILE pointer for saving cache.");
        return -1;
    }

    std::vector<unsigned char> serialized;
    const int status = save_cache(serialized);
    if (status != 0)
    {
        NCNN_LOGE("Failed to get cache data into buffer for saving to file.");
        return status;
    }

    const size_t written = fwrite(serialized.data(), 1, serialized.size(), fp);
    if (written == serialized.size())
        return 0;

    NCNN_LOGE("Failed to write cache data to file.");
    return -1;
}

// Read a whole stream and load the cache from its content.
//
// The stream must be seekable: its size is found by seeking to the end, and
// reading always starts from the beginning regardless of the position the
// stream had on entry. Returns -1 when the stream is null, cannot be
// measured or rewound, or cannot be read completely; otherwise returns the
// result of loading from the bytes read.
int PipelineCache::load_cache(FILE* fp) const
{
    if (!fp)
    {
        NCNN_LOGE("Invalid FILE pointer for loading cache.");
        return -1;
    }

    // A failed seek would make the size below meaningless, so check it.
    if (fseek(fp, 0, SEEK_END) != 0)
    {
        NCNN_LOGE("Failed to determine file size for loading cache.");
        return -1;
    }

    const long file_size = ftell(fp);
    if (file_size < 0)
    {
        NCNN_LOGE("Failed to determine file size for loading cache.");
        return -1;
    }

    if (fseek(fp, 0, SEEK_SET) != 0)
    {
        NCNN_LOGE("Failed to rewind file for loading cache.");
        return -1;
    }

    std::vector<unsigned char> buf((size_t)file_size);
    if (!buf.empty() && fread(buf.data(), 1, buf.size(), fp) != buf.size())
    {
        NCNN_LOGE("Failed to read cache data from file.");
        return -1;
    }

    return load_cache(buf);
}

// Serialize the cache into the file at the given path, replacing any
// existing file.
//
// Returns 0 only when the data was written and the file was closed without
// error; otherwise a non-zero value is returned.
int PipelineCache::save_cache(const char* filename) const
{
    if (!filename)
    {
        NCNN_LOGE("Invalid filename for saving cache.");
        return -1;
    }

    FILE* fp = fopen(filename, "wb");
    if (!fp)
    {
        NCNN_LOGE("Failed to open file %s for writing cache.", filename);
        return -1;
    }

    int ret = save_cache(fp);

    // Buffered data is flushed by fclose; a failure there means the file on
    // disk is incomplete even though every fwrite reported success.
    if (fclose(fp) != 0 && ret == 0)
    {
        NCNN_LOGE("Failed to close file %s after writing cache.", filename);
        ret = -1;
    }

    return ret;
}

// Open the file at the given path in binary mode and load the cache from it.
//
// Returns -1 when the path is null or the file cannot be opened; otherwise
// returns the result of loading from the opened stream, which is always
// closed before returning.
int PipelineCache::load_cache(const char* filename) const
{
    if (filename == nullptr)
    {
        NCNN_LOGE("Invalid filename for loading cache.");
        return -1;
    }

    FILE* stream = fopen(filename, "rb");
    if (stream == nullptr)
    {
        NCNN_LOGE("Failed to open file %s for reading cache.", filename);
        return -1;
    }

    const int status = load_cache(stream);
    fclose(stream);
    return status;
}

int PipelineCache::create_shader_module(int shader_type_index, const Option& opt,
uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
VkShaderModule* _shader_module, ShaderInfo& si) const
{
uint32_t opt_bits = 0 << 7
| opt.use_fp16_packed << 6
| opt.use_fp16_storage << 5
| opt.use_fp16_arithmetic << 4
| opt.use_int8_storage << 3
| opt.use_int8_arithmetic << 2;

std::vector<uint32_t> spirv;

for (int i = 0; i < d->cache_spirv_module.size(); i++)
{
if (d->cache_spirv_module[i].first.d0 == PipelineCachePrivate::spv_param({shader_type_index, opt_bits}).d0) // hit cache
{
spirv = d->cache_spirv_module[i].second;
goto hit_cache;
}
}

int retc = compile_spirv_module(shader_type_index, opt, spirv);
if (retc != 0)
{
NCNN_LOGE("compile_spirv_module failed %d", retc);
return -1;
}

d->cache_spirv_module.push_back({{shader_type_index, opt_bits}, spirv});
hit_cache:
const uint32_t* spv_data = spirv.data();
size_t spv_data_size = spirv.size() * 4;

@@ -445,7 +756,7 @@ int PipelineCache::new_pipeline(VkShaderModule shader_module, const ShaderInfo&
if (ret != 0)
goto ERROR_PipelineCache;

ret = vkdev->create_pipeline(shader_module, pipeline_layout, specializations, subgroup_size, &pipeline);
ret = vkdev->create_pipeline(shader_module, pipeline_layout, specializations, subgroup_size, &pipeline, d->vk_pipeline_cache);
if (ret != 0)
goto ERROR_PipelineCache;



+ 10
- 0
src/pipelinecache.h View File

@@ -42,6 +42,16 @@ public:
VkDescriptorUpdateTemplateKHR* descriptor_update_template,
ShaderInfo& shader_info) const;

// Serialize the cache content into buf / restore it from buf.
// Both return 0 on success and a non-zero value on failure.
int save_cache(std::vector<unsigned char>& buf) const;
int load_cache(const std::vector<unsigned char>& buf) const;

// NOTE(review): if NCNN_STDIO is always defined as 0 or 1 by the build
// configuration, this guard never excludes anything and should read
// "#if NCNN_STDIO", with the definitions placed under the same guard - confirm.
#ifdef NCNN_STDIO
// Same as above, using an already opened stream.
int save_cache(FILE* fp) const;
int load_cache(FILE* fp) const;
// Same as above, opening the file at the given path.
int save_cache(const char* filename) const;
int load_cache(const char* filename) const;
#endif

protected:
int create_shader_module(int shader_type_index, const Option& opt,
uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,


+ 1
- 0
tests/CMakeLists.txt View File

@@ -65,6 +65,7 @@ ncnn_add_test(paramdict)

if(NCNN_VULKAN)
ncnn_add_test(command)
ncnn_add_test(pipecache)
endif()

if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")


+ 405
- 0
tests/test_pipecache.cpp View File

@@ -0,0 +1,405 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "datareader.h"
#include "gpu.h"
#include "mat.h"
#include "net.h"
#include "pipelinecache.h"
#include "testutil.h"

#include <iostream>
#include <chrono>
#include <vector>

// 一个空数据读取器,用于加载模型结构,权重将全部为0
class DataReaderFromEmpty : public ncnn::DataReader
{
public:
virtual int scan(const char* format, void* p) const
{
(void)format; // unused
(void)p; // unused
return 0;
}
virtual size_t read(void* buf, size_t size) const
{
memset(buf, 0, size);
return size;
}
};

// Network structure (param file content) of MobileNetV3
static const char* mobilenet_v3_param = R"delimiter(
7767517
145 163
Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution 313 1 1 data 313 -23330=4,3,112,112,16 0=16 1=3 3=2 4=1 5=1 6=432
Split splitncnn_0 1 2 313 313_splitncnn_0 313_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16
HardSigmoid 319 1 1 313_splitncnn_1 319 -23330=4,3,112,112,16
BinaryOp 320 2 1 313_splitncnn_0 319 320 -23330=4,3,112,112,16 0=2
Split splitncnn_1 1 2 320 320_splitncnn_0 320_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16
ConvolutionDepthWise 321 1 1 320_splitncnn_1 323 -23330=4,3,112,112,16 0=16 1=3 4=1 5=1 6=144 7=16 9=1
Convolution 324 1 1 323 324 -23330=4,3,112,112,16 0=16 1=1 5=1 6=256
BinaryOp 326 2 1 320_splitncnn_0 324 326 -23330=4,3,112,112,16
Convolution 327 1 1 326 329 -23330=4,3,112,112,64 0=64 1=1 5=1 6=1024 9=1
ConvolutionDepthWise 330 1 1 329 332 -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1
Convolution 333 1 1 332 333 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1536
Split splitncnn_2 1 2 333 333_splitncnn_0 333_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
Convolution 335 1 1 333_splitncnn_1 337 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
ConvolutionDepthWise 338 1 1 337 340 -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1
Convolution 341 1 1 340 341 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728
BinaryOp 343 2 1 333_splitncnn_0 341 343 -23330=4,3,56,56,24
Convolution 344 1 1 343 346 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
ConvolutionDepthWise 347 1 1 346 347 -23330=4,3,28,28,72 0=72 1=5 3=2 4=2 5=1 6=1800 7=72
Split splitncnn_3 1 2 347 347_splitncnn_0 347_splitncnn_1 -23330=8,3,28,28,72,3,28,28,72
Pooling 355 1 1 347_splitncnn_1 359 -23330=4,1,72,1,1 0=1 4=1
InnerProduct 360 1 1 359 361 -23330=4,1,18,1,1 0=18 1=1 2=1296 9=1
InnerProduct 362 1 1 361 362 -23330=4,1,72,1,1 0=72 1=1 2=1296
HardSigmoid 367 1 1 362 367 -23330=4,1,72,1,1
BinaryOp 376 2 1 347_splitncnn_0 367 376 -23330=4,3,28,28,72 0=2
ReLU 377 1 1 376 377 -23330=4,3,28,28,72
Convolution 378 1 1 377 378 -23330=4,3,28,28,40 0=40 1=1 5=1 6=2880
Split splitncnn_4 1 2 378 378_splitncnn_0 378_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
Convolution 380 1 1 378_splitncnn_1 382 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
ConvolutionDepthWise 383 1 1 382 383 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120
Split splitncnn_5 1 2 383 383_splitncnn_0 383_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120
Pooling 391 1 1 383_splitncnn_1 395 -23330=4,1,120,1,1 0=1 4=1
InnerProduct 396 1 1 395 397 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1
InnerProduct 398 1 1 397 398 -23330=4,1,120,1,1 0=120 1=1 2=3600
HardSigmoid 403 1 1 398 403 -23330=4,1,120,1,1
BinaryOp 412 2 1 383_splitncnn_0 403 412 -23330=4,3,28,28,120 0=2
ReLU 413 1 1 412 413 -23330=4,3,28,28,120
Convolution 414 1 1 413 414 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
BinaryOp 416 2 1 378_splitncnn_0 414 416 -23330=4,3,28,28,40
Split splitncnn_6 1 2 416 416_splitncnn_0 416_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
Convolution 417 1 1 416_splitncnn_1 419 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
ConvolutionDepthWise 420 1 1 419 420 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120
Split splitncnn_7 1 2 420 420_splitncnn_0 420_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120
Pooling 428 1 1 420_splitncnn_1 432 -23330=4,1,120,1,1 0=1 4=1
InnerProduct 433 1 1 432 434 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1
InnerProduct 435 1 1 434 435 -23330=4,1,120,1,1 0=120 1=1 2=3600
HardSigmoid 440 1 1 435 440 -23330=4,1,120,1,1
BinaryOp 449 2 1 420_splitncnn_0 440 449 -23330=4,3,28,28,120 0=2
ReLU 450 1 1 449 450 -23330=4,3,28,28,120
Convolution 451 1 1 450 451 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
BinaryOp 453 2 1 416_splitncnn_0 451 453 -23330=4,3,28,28,40
Convolution 454 1 1 453 454 -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600
HardSwish 461 1 1 454 461 -23330=4,3,28,28,240
ConvolutionDepthWise 462 1 1 461 462 -23330=4,3,14,14,240 0=240 1=3 3=2 4=1 5=1 6=2160 7=240
HardSwish 469 1 1 462 469 -23330=4,3,14,14,240
Convolution 470 1 1 469 470 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
Split splitncnn_8 1 2 470 470_splitncnn_0 470_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution 472 1 1 470_splitncnn_1 472 -23330=4,3,14,14,200 0=200 1=1 5=1 6=16000
HardSwish 479 1 1 472 479 -23330=4,3,14,14,200
ConvolutionDepthWise 480 1 1 479 480 -23330=4,3,14,14,200 0=200 1=3 4=1 5=1 6=1800 7=200
HardSwish 487 1 1 480 487 -23330=4,3,14,14,200
Convolution 488 1 1 487 488 -23330=4,3,14,14,80 0=80 1=1 5=1 6=16000
BinaryOp 490 2 1 470_splitncnn_0 488 490 -23330=4,3,14,14,80
Split splitncnn_9 1 2 490 490_splitncnn_0 490_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution 491 1 1 490_splitncnn_1 491 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720
HardSwish 498 1 1 491 498 -23330=4,3,14,14,184
ConvolutionDepthWise 499 1 1 498 499 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184
HardSwish 506 1 1 499 506 -23330=4,3,14,14,184
Convolution 507 1 1 506 507 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720
BinaryOp 509 2 1 490_splitncnn_0 507 509 -23330=4,3,14,14,80
Split splitncnn_10 1 2 509 509_splitncnn_0 509_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution 510 1 1 509_splitncnn_1 510 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720
HardSwish 517 1 1 510 517 -23330=4,3,14,14,184
ConvolutionDepthWise 518 1 1 517 518 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184
HardSwish 525 1 1 518 525 -23330=4,3,14,14,184
Convolution 526 1 1 525 526 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720
BinaryOp 528 2 1 509_splitncnn_0 526 528 -23330=4,3,14,14,80
Convolution 529 1 1 528 529 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400
HardSwish 536 1 1 529 536 -23330=4,3,14,14,480
ConvolutionDepthWise 537 1 1 536 537 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480
Split splitncnn_11 1 2 537 537_splitncnn_0 537_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
Pooling 545 1 1 537_splitncnn_1 549 -23330=4,1,480,1,1 0=1 4=1
InnerProduct 550 1 1 549 551 -23330=4,1,120,1,1 0=120 1=1 2=57600 9=1
InnerProduct 552 1 1 551 552 -23330=4,1,480,1,1 0=480 1=1 2=57600
HardSigmoid 557 1 1 552 557 -23330=4,1,480,1,1
BinaryOp 566 2 1 537_splitncnn_0 557 566 -23330=4,3,14,14,480 0=2
HardSwish 572 1 1 566 572 -23330=4,3,14,14,480
Convolution 573 1 1 572 573 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760
Split splitncnn_12 1 2 573 573_splitncnn_0 573_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
Convolution 575 1 1 573_splitncnn_1 575 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
HardSwish 582 1 1 575 582 -23330=4,3,14,14,672
ConvolutionDepthWise 583 1 1 582 583 -23330=4,3,14,14,672 0=672 1=3 4=1 5=1 6=6048 7=672
Split splitncnn_13 1 2 583 583_splitncnn_0 583_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
Pooling 591 1 1 583_splitncnn_1 595 -23330=4,1,672,1,1 0=1 4=1
InnerProduct 596 1 1 595 597 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
InnerProduct 598 1 1 597 598 -23330=4,1,672,1,1 0=672 1=1 2=112896
HardSigmoid 603 1 1 598 603 -23330=4,1,672,1,1
BinaryOp 612 2 1 583_splitncnn_0 603 612 -23330=4,3,14,14,672 0=2
HardSwish 618 1 1 612 618 -23330=4,3,14,14,672
Convolution 619 1 1 618 619 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264
BinaryOp 621 2 1 573_splitncnn_0 619 621 -23330=4,3,14,14,112
Convolution 622 1 1 621 622 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
HardSwish 629 1 1 622 629 -23330=4,3,14,14,672
ConvolutionDepthWise 630 1 1 629 630 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672
Split splitncnn_14 1 2 630 630_splitncnn_0 630_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
Pooling 638 1 1 630_splitncnn_1 642 -23330=4,1,672,1,1 0=1 4=1
InnerProduct 643 1 1 642 644 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
InnerProduct 645 1 1 644 645 -23330=4,1,672,1,1 0=672 1=1 2=112896
HardSigmoid 650 1 1 645 650 -23330=4,1,672,1,1
BinaryOp 659 2 1 630_splitncnn_0 650 659 -23330=4,3,14,14,672 0=2
HardSwish 665 1 1 659 665 -23330=4,3,14,14,672
Convolution 666 1 1 665 666 -23330=4,3,14,14,160 0=160 1=1 5=1 6=107520
Convolution 668 1 1 666 668 -23330=4,3,14,14,672 0=672 1=1 5=1 6=107520
HardSwish 675 1 1 668 675 -23330=4,3,14,14,672
ConvolutionDepthWise 676 1 1 675 676 -23330=4,3,7,7,672 0=672 1=5 3=2 4=2 5=1 6=16800 7=672
Split splitncnn_15 1 2 676 676_splitncnn_0 676_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672
Pooling 684 1 1 676_splitncnn_1 688 -23330=4,1,672,1,1 0=1 4=1
InnerProduct 689 1 1 688 690 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
InnerProduct 691 1 1 690 691 -23330=4,1,672,1,1 0=672 1=1 2=112896
HardSigmoid 696 1 1 691 696 -23330=4,1,672,1,1
BinaryOp 705 2 1 676_splitncnn_0 696 705 -23330=4,3,7,7,672 0=2
HardSwish 711 1 1 705 711 -23330=4,3,7,7,672
Convolution 712 1 1 711 712 -23330=4,3,7,7,160 0=160 1=1 5=1 6=107520
Split splitncnn_16 1 2 712 712_splitncnn_0 712_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160
Convolution 714 1 1 712_splitncnn_1 714 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600
HardSwish 721 1 1 714 721 -23330=4,3,7,7,960
ConvolutionDepthWise 722 1 1 721 722 -23330=4,3,7,7,960 0=960 1=5 4=2 5=1 6=24000 7=960
Split splitncnn_17 1 2 722 722_splitncnn_0 722_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960
Pooling 730 1 1 722_splitncnn_1 734 -23330=4,1,960,1,1 0=1 4=1
InnerProduct 735 1 1 734 736 -23330=4,1,240,1,1 0=240 1=1 2=230400 9=1
InnerProduct 737 1 1 736 737 -23330=4,1,960,1,1 0=960 1=1 2=230400
HardSigmoid 742 1 1 737 742 -23330=4,1,960,1,1
BinaryOp 751 2 1 722_splitncnn_0 742 751 -23330=4,3,7,7,960 0=2
HardSwish 757 1 1 751 757 -23330=4,3,7,7,960
Convolution 758 1 1 757 758 -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600
BinaryOp 760 2 1 712_splitncnn_0 758 760 -23330=4,3,7,7,160
Convolution 761 1 1 760 761 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600
HardSwish 768 1 1 761 768 -23330=4,3,7,7,960
Pooling 769 1 1 768 769 -23330=4,1,960,1,1 0=1 4=1
HardSwish 775 1 1 769 775 -23330=4,1,960,1,1
Reshape 783 1 1 775 783 -23330=4,1,960,1,1 0=-1
InnerProduct 784 1 1 783 784 -23330=4,1,1280,1,1 0=1280 1=1 2=1228800
HardSwish 790 1 1 784 790 -23330=4,1,1280,1,1
InnerProduct 791 1 1 790 791 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000
Softmax prob 1 1 791 output -23330=4,1,1000,1,1
)delimiter";

/**
* @brief 使用一个简单的 Sigmoid 网络预热并测试 Pipeline Cache 的基本保存和加载功能
* @return 0 on success, -1 on failure
*/
static int warmup_gpu_pipecache()
{
std::cout << "==================================================" << std::endl;
std::cout << " Warmup: Testing Basic Cache IO " << std::endl;
std::cout << "==================================================" << std::endl;

// 1. 创建一个网络,运行一次以生成 pipeline
ncnn::Net net;
net.opt.use_vulkan_compute = true;

net.load_param_mem("7767517\n2 2\nInput input0 0 1 input0\nSigmoid sigmoid0 1 1 input0 output0");
net.load_model((unsigned char*)""); // 用于创建 pipeline

ncnn::Mat input0 = RandomMat(224, 224);
ncnn::Mat output0;
{
ncnn::Extractor ex = net.create_extractor();
ex.input("input0", input0);
ex.extract("output0", output0);
}

if (output0.empty())
{
std::cerr << "Warmup failed: initial extraction failed." << std::endl;
return -1;
}

// 2. 保存 pipeline cache
const char* cache_path = "./sigmoid_pipecache.bin";
if (net.opt.pipeline_cache->save_cache(cache_path) != 0)
{
std::cerr << "Warmup failed: could not save pipeline cache to " << cache_path << std::endl;
return -1;
}
std::cout << "Warmup: Pipeline cache saved successfully." << std::endl;

// 3. 创建第二个网络,加载刚才保存的 cache
ncnn::Net net2;
net2.opt.use_vulkan_compute = true;
net2.opt.pipeline_cache = new ncnn::PipelineCache(net.vulkan_device());

net2.load_param_mem("7767517\n2 2\nInput input0 0 1 input0\nSigmoid sigmoid0 1 1 input0 output0");
if (net2.opt.pipeline_cache->load_cache(cache_path) != 0)
{
std::cerr << "Warmup failed: could not load pipeline cache from " << cache_path << std::endl;
return -1;
}
std::cout << "Warmup: Pipeline cache loaded successfully." << std::endl;
net2.load_model((unsigned char*)""); // 创建 pipeline

// 4. 再次推理并验证结果是否一致
ncnn::Mat output0_2;
{
ncnn::Extractor ex2 = net2.create_extractor();
ex2.input("input0", input0);
ex2.extract("output0", output0_2);
}

if (output0_2.empty())
{
std::cerr << "Warmup failed: extraction after loading cache failed." << std::endl;
return -1;
}

if (CompareMat(output0, output0_2, 0.001) != 0)
{
std::cerr << "Warmup failed: output mismatch after loading cache." << std::endl;
return -1;
}

std::cout << "Warmup PASSED: Outputs are identical." << std::endl;
return 0;
}

/**
* @brief 对比使用和不使用 Pipeline Cache 时的模型加载性能
* @return 0 on success, -1 on failure
*/
static int test_gpu_pipecache_performance()
{
ncnn::Mat output_no_cache;
double time_no_cache = 0;

const char* cache_path = "./mobilenet_pipecache.bin";
DataReaderFromEmpty dr;
ncnn::Mat input = RandomMat(224, 224, 3);

// -------------------------------------------------
// 1. 不使用 Pipeline Cache (首次加载)
// -------------------------------------------------
std::cout << "\n==================================================" << std::endl;
std::cout << " Performance Test: Without Pipeline Cache " << std::endl;
std::cout << "==================================================" << std::endl;
{
ncnn::Net net_no_cache;
net_no_cache.opt.use_vulkan_compute = true;

auto start = std::chrono::high_resolution_clock::now();

net_no_cache.load_param_mem(mobilenet_v3_param);
net_no_cache.load_model(dr);

auto end = std::chrono::high_resolution_clock::now();
time_no_cache = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start).count();
std::cout << "Model loading time without cache: " << time_no_cache << " ms" << std::endl;

// 推理以获得基准输出
ncnn::Extractor ex = net_no_cache.create_extractor();
ex.input("data", input);
ex.extract("output", output_no_cache);

if (output_no_cache.empty())
{
std::cerr << "Test failed: extraction without cache failed." << std::endl;
return -1;
}

// 保存 cache 以供下一步使用
if (net_no_cache.opt.pipeline_cache->save_cache(cache_path) != 0)
{
std::cerr << "Test failed: could not save pipeline cache to " << cache_path << std::endl;
return -1;
}
std::cout << "Pipeline cache generated and saved to " << cache_path << std::endl;
}

// -------------------------------------------------
// 2. 使用 Pipeline Cache (二次加载)
// -------------------------------------------------
ncnn::Mat output_with_cache;
double time_with_cache = 0;
std::cout << "\n==================================================" << std::endl;
std::cout << " Performance Test: With Pipeline Cache " << std::endl;
std::cout << "==================================================" << std::endl;
{
ncnn::Net net_with_cache;
// 必须在加载模型前设置好 cache
net_with_cache.opt.pipeline_cache = new ncnn::PipelineCache(ncnn::get_gpu_device());
net_with_cache.opt.use_vulkan_compute = true;

auto start = std::chrono::high_resolution_clock::now();

// 从文件加载 cache
if (net_with_cache.opt.pipeline_cache->load_cache(cache_path) != 0)
{
std::cerr << "Test failed: could not load pipeline cache from " << cache_path << std::endl;
return -1;
}
net_with_cache.load_param_mem(mobilenet_v3_param);
net_with_cache.load_model(dr);

auto end = std::chrono::high_resolution_clock::now();
time_with_cache = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start).count();
std::cout << "Model loading time with cache: " << time_with_cache << " ms" << std::endl;

// 推理
ncnn::Extractor ex2 = net_with_cache.create_extractor();
ex2.input("data", input);
ex2.extract("output", output_with_cache);

if (output_with_cache.empty())
{
std::cerr << "Test failed: extraction with cache failed." << std::endl;
return -1;
}
}

// -------------------------------------------------
// 3. 结果验证与总结
// -------------------------------------------------
std::cout << "\n==================================================" << std::endl;
std::cout << " Verification and Summary " << std::endl;
std::cout << "==================================================" << std::endl;

bool is_output_same = (CompareMat(output_no_cache, output_with_cache, 0.001) == 0);

std::cout << "Output verification: " << (is_output_same ? "SUCCESS" : "FAILURE") << std::endl;
std::cout << "--------------------------------------------------" << std::endl;
std::cout << "Performance Summary:" << std::endl;
std::cout << " - Without Cache: " << time_no_cache << " ms" << std::endl;
std::cout << " - With Cache: " << time_with_cache << " ms" << std::endl;

if (time_no_cache > 0) {
double speedup = (time_no_cache - time_with_cache) / time_no_cache * 100;
std::cout << " - Speedup: " << speedup << "%" << std::endl;
}

if (!is_output_same)
{
std::cerr << "\nTest FAILED due to output mismatch." << std::endl;
return -1;
}

std::cout << "\nTest PASSED." << std::endl;
return 0;
}

int main()
{
    // Warmup first: it checks basic cache IO; any failure aborts with -1.
    const int warmup_status = warmup_gpu_pipecache();
    if (warmup_status != 0)
        return -1;

    // Then the performance comparison, whose result becomes the exit status.
    const int perf_status = test_gpu_pipecache_performance();
    return perf_status;
}

Loading…
Cancel
Save