feat: pipe & spv cache

11 months ago · 7a0c19c856
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -98,6 +98,7 @@ struct layer_shader_registry_entry

 static const layer_shader_registry_entry layer_shader_registry[] = {
 #include "layer_shader_registry.h"

 };

 static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry);
@@ -476,16 +477,16 @@ void GpuInfoPrivate::query_properties()
    }

    if (physicalDeviceProperties.vendorID == 0x13b5
            && (physicalDeviceProperties.deviceID == 0x7500001
                || physicalDeviceProperties.deviceID == 0x7501000
                || physicalDeviceProperties.deviceID == 0x8602000
                || physicalDeviceProperties.deviceID == 0x8800020
                || physicalDeviceProperties.deviceID == 0x70930000
                || physicalDeviceProperties.deviceID == 0x70901010
                || physicalDeviceProperties.deviceID == 0x72120000
                || physicalDeviceProperties.deviceID == 0x74021000
                || physicalDeviceProperties.deviceID == 0x60a00002
                || physicalDeviceProperties.deviceID == 0x62210001))
        && (physicalDeviceProperties.deviceID == 0x7500001
            || physicalDeviceProperties.deviceID == 0x7501000
            || physicalDeviceProperties.deviceID == 0x8602000
            || physicalDeviceProperties.deviceID == 0x8800020
            || physicalDeviceProperties.deviceID == 0x70930000
            || physicalDeviceProperties.deviceID == 0x70901010
            || physicalDeviceProperties.deviceID == 0x72120000
            || physicalDeviceProperties.deviceID == 0x74021000
            || physicalDeviceProperties.deviceID == 0x60a00002
            || physicalDeviceProperties.deviceID == 0x62210001))
    {
        // NOTE rk3288/rk3399/t880/g31/g51/g52/g71/g72
        // however, g76/g77 has explicit fp16 arithmetic
@@ -494,9 +495,9 @@ void GpuInfoPrivate::query_properties()
    }

    if (physicalDeviceProperties.vendorID == 0x5143
            && (physicalDeviceProperties.deviceID == 0x6030001
                || physicalDeviceProperties.deviceID == 0x6040001
                || physicalDeviceProperties.deviceID == 0x6050002))
        && (physicalDeviceProperties.deviceID == 0x6030001
            || physicalDeviceProperties.deviceID == 0x6040001
            || physicalDeviceProperties.deviceID == 0x6050002))
    {
        // TODO enable devices other than qcom845/qcom855/qcom855plus/qcom865
        // qcom adreno driver accept spirv with fp16 arithmetic
@@ -512,7 +513,7 @@ static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyPropert
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
                && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
            && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
        {
            return i;
        }
@@ -524,7 +525,7 @@ static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyPropert
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
                && (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
            && (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
        {
            return i;
        }
@@ -553,8 +554,8 @@ static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProper
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if ((queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
                && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
                && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
            && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
            && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
        {
            return i;
        }
@@ -1137,30 +1138,30 @@ void GpuInfoPrivate::query_extension_properties()
            // NCNN_LOGE("cpm %2d %2d %2d  %d %d %d %d  %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope);

            if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
            {
                support_cooperative_matrix_8_8_16 = true;
            }
            if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
            {
                support_cooperative_matrix_16_8_8 = true;
            }
            if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
            {
                support_cooperative_matrix_16_8_16 = true;
            }
            if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
            {
                support_cooperative_matrix_16_16_16 = true;
            }
@@ -1194,30 +1195,30 @@ void GpuInfoPrivate::query_extension_properties()
            // NCNN_LOGE("cpm %2d %2d %2d  %d %d %d %d  %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope);

            if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
                && cmp.scope == VK_SCOPE_SUBGROUP_NV)
            {
                support_cooperative_matrix_8_8_16 = true;
            }
            if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
                && cmp.scope == VK_SCOPE_SUBGROUP_NV)
            {
                support_cooperative_matrix_16_8_8 = true;
            }
            if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
                && cmp.scope == VK_SCOPE_SUBGROUP_NV)
            {
                support_cooperative_matrix_16_8_16 = true;
            }
            if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
                && cmp.scope == VK_SCOPE_SUBGROUP_NV)
            {
                support_cooperative_matrix_16_16_16 = true;
            }
@@ -2031,8 +2032,8 @@ void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponen
            const VkCooperativeMatrixPropertiesKHR& cmp = d->queryCooperativeMatrixSubProperties[i];

            if (cmp.AType == type && cmp.BType == type
                    && cmp.CType == acctype && cmp.ResultType == acctype
                    && cmp.scope == scope)
                && cmp.CType == acctype && cmp.ResultType == acctype
                && cmp.scope == scope)
            {
                mnk_properties.push_back(cmp);
            }
@@ -2045,8 +2046,8 @@ void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponen
            const VkCooperativeMatrixPropertiesNV& cmp = d->queryCooperativeMatrixSubPropertiesNV[i];

            if (cmp.AType == (VkComponentTypeNV)type && cmp.BType == (VkComponentTypeNV)type
                    && cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype
                    && cmp.scope == (VkScopeNV)scope)
                && cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype
                && cmp.scope == (VkScopeNV)scope)
            {
                VkCooperativeMatrixPropertiesKHR cmp_khr;
                cmp_khr.MSize = cmp.MSize;
@@ -2459,7 +2460,7 @@ int create_gpu_instance(const char* driver_path)
 #endif // __ANDROID_API__ >= 26

    uint32_t instance_api_version = VK_MAKE_VERSION(1, 0, 0);
    typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion);
    typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t* pApiVersion);
    PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion");
    if (vkEnumerateInstanceVersion)
    {
@@ -2672,7 +2673,7 @@ int create_gpu_instance(const char* driver_path)
                        fp16_matrix_properties.push_back(cmp);
                }
                if ((cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.AType == VK_COMPONENT_TYPE_SINT8_PACKED_NV)
                        && (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV))
                    && (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV))
                {
                    bool mnk_hit = false;
                    for (size_t k = 0; k < int8_matrix_properties.size(); k++)
@@ -2703,9 +2704,9 @@ int create_gpu_instance(const char* driver_path)
                        bf16_matrix_properties.push_back(cmp);
                }
                if ((cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
                        || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)
                        && (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
                            || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV))
                     || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)
                    && (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
                        || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV))
                {
                    bool mnk_hit = false;
                    for (size_t k = 0; k < fp8_matrix_properties.size(); k++)
@@ -3143,8 +3144,9 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_
    uop->vkdev = vkdev;

    ncnn::ParamDict pd;
    pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack
    pd.set(2, cast_type_from_index + 1);                                            // 0=auto 1=fp32 2=fp16 3=int8
    pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4
                                                                          : 8); // out_elempack
    pd.set(2, cast_type_from_index + 1);                                        // 0=auto 1=fp32 2=fp16 3=int8
    pd.set(3, cast_type_to_index + 1);

    uop->load_param(pd);
@@ -3734,7 +3736,7 @@ int VulkanDevice::create_pipeline_layout(int push_constant_count, VkDescriptorSe
    return 0;
 }

 int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const
 int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline, VkPipelineCache pipeline_cache) const
 {
    const int specialization_count = specializations.size();

@@ -3792,7 +3794,7 @@ int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout
    computePipelineCreateInfo.basePipelineHandle = 0;
    computePipelineCreateInfo.basePipelineIndex = 0;

    VkResult ret = vkCreateComputePipelines(d->device, 0, 1, &computePipelineCreateInfo, 0, pipeline);
    VkResult ret = vkCreateComputePipelines(d->device, pipeline_cache, 1, &computePipelineCreateInfo, 0, pipeline);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateComputePipelines failed %d", ret);
@@ -3871,6 +3873,18 @@ int VulkanDevice::create_descriptor_update_template(int binding_count, const int
    return 0;
 }

 int VulkanDevice::create_pipeline_cache(const VkPipelineCacheCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineCache* pPipelineCache) const
 {
    VkResult ret = vkCreatePipelineCache(d->device, pCreateInfo, pAllocator, pPipelineCache);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreatePipelineCache failed %d", ret);
        return -1;
    }

    return 0;
 }

 uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const
 {
    const VkPhysicalDeviceMemoryProperties& memory_properties = info.physicalDeviceMemoryProperties();
@@ -3883,8 +3897,8 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ
        {
            const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
            if ((memoryType.propertyFlags & required) == required
                    && (preferred && (memoryType.propertyFlags & preferred))
                    && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
                && (preferred && (memoryType.propertyFlags & preferred))
                && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
            {
                return i;
            }
@@ -3899,7 +3913,7 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ
        {
            const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
            if ((memoryType.propertyFlags & required) == required
                    && (preferred && (memoryType.propertyFlags & preferred)))
                && (preferred && (memoryType.propertyFlags & preferred)))
            {
                return i;
            }
@@ -3914,7 +3928,7 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ
        {
            const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
            if ((memoryType.propertyFlags & required) == required
                    && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
                && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
            {
                return i;
            }
@@ -4222,7 +4236,8 @@ void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempac

 void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const
 {
    int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;
    int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1
                                                                          : 2;

    int cast_type_from_index;
    if (src.elembits() == 32)
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -419,8 +419,9 @@ public:
    // helper for creating pipeline
    int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
    int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const;
    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline, VkPipelineCache pipeline_cache = 0) const;
    int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
    int create_pipeline_cache(const VkPipelineCacheCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineCache* pPipelineCache) const;

    uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
    bool is_mappable(uint32_t memory_type_index) const;
--- a/src/pipelinecache.cpp
+++ b/src/pipelinecache.cpp
@@ -110,13 +110,42 @@ public:
        ShaderInfo shader_info; // TODO use pointer ?
    };

    // Key identifying one compiled SPIR-V module stored in cache_spirv_module.
    // The (shader_type_index, opt_bits) pair is overlaid with the 64-bit d0 so
    // that two keys can be matched with one integer comparison on d0.
    // save_cache() copies this struct byte-for-byte into the cache buffer, so
    // its layout is part of the serialized format.
    // NOTE(review): the unnamed struct inside the union is a widely supported
    // compiler extension rather than standard C++ - confirm every target toolchain accepts it.
    struct spv_param
    {
        union
        {
            struct
            {
                int32_t shader_type_index; // shader the module was compiled from
                uint32_t opt_bits;         // packed Option flags, built in create_shader_module()
            };
            uint64_t d0; // both fields above viewed as a single 64-bit value
        };
    };

    // Fixed-size header placed at the very beginning of a serialized cache buffer.
    // save_cache() fills it in; load_cache() rejects a buffer whose magic or
    // device identification fields differ from those of the current device.
    struct pipeline_cache_header
    {
        uint32_t magic = 0x5a545546; // format tag checked by load_cache()
        uint32_t vendorID;          // VkPhysicalDeviceProperties::vendorID
        uint32_t deviceID;          // VkPhysicalDeviceProperties::deviceID
        uint32_t driverVersion;     // VkPhysicalDeviceProperties::driverVersion
        uint8_t uuid[VK_UUID_SIZE]; // VkPhysicalDeviceProperties::pipelineCacheUUID

        uint32_t spv_size; // number of SPIR-V module entries that follow the header (an entry count, not a byte size)
        uint32_t pipeline_cache_size; // byte size of the Vulkan pipeline cache blob stored after the SPIR-V entries
    };

    mutable std::vector<pipeline_cache_digest> cache_digests;
    mutable std::vector<pipeline_cache_artifact> cache_artifacts;

    VkPipelineCache vk_pipeline_cache;
    mutable std::vector<std::pair<spv_param, std::vector<uint32_t> > > cache_spirv_module; // digest(index,opt) -> spirv data

    mutable Mutex cache_lock;
 };

 PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
        uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
                                                                   uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
 {
    spv_data_murmur3 = murmur3_32(spv_data, spv_data_size / 4);

@@ -134,7 +163,7 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_
 }

 PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
        uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
                                                                   uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
 {
    shader_type_index = _shader_type_index;

@@ -160,6 +189,18 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_t
 // Construct a pipeline cache bound to _vkdev and try to create an empty
 // Vulkan pipeline cache object for it.
 // A failure to create the Vulkan object is not fatal: the handle stays 0 and
 // pipelines are then simply created without a driver-side cache.
 PipelineCache::PipelineCache(const VulkanDevice* _vkdev)
    : vkdev(_vkdev), d(new PipelineCachePrivate)
 {
    // make the handle well defined before anything can fail
    d->vk_pipeline_cache = 0;

    // save_cache()/load_cache() treat a null device as a possible state,
    // so construction must not dereference it either
    if (!vkdev)
        return;

    VkPipelineCacheCreateInfo pipelineCacheCreateInfo{};
    pipelineCacheCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
    pipelineCacheCreateInfo.initialDataSize = 0; // zeros for empty cache
    pipelineCacheCreateInfo.pInitialData = nullptr;

    int ret = vkdev->create_pipeline_cache(&pipelineCacheCreateInfo, 0, &d->vk_pipeline_cache);
    if (ret != 0)
    {
        NCNN_LOGE("create_pipeline_cache failed %d", ret);
        d->vk_pipeline_cache = 0;
    }
 }

 PipelineCache::~PipelineCache()
@@ -381,18 +422,288 @@ int PipelineCache::get_pipeline(int shader_type_index, const Option& opt, const
    return 0;
 }

 // Serialize the cached SPIR-V modules and the Vulkan pipeline cache into buf.
 //
 // Layout of the produced buffer:
 //   pipeline_cache_header
 //   spv_size x { spv_param, uint32_t word count, SPIR-V words }
 //   pipeline_cache_size bytes of driver pipeline cache data
 //
 // buf is replaced on success and left untouched on failure.
 // Returns 0 on success, -1 on failure.
 int PipelineCache::save_cache(std::vector<unsigned char>& buf) const
 {
    if (!vkdev)
    {
        NCNN_LOGE("vkdev is null");
        return -1;
    }
    MutexLockGuard lock(d->cache_lock);

    // Fetch the driver blob first so the header can record its final size.
    // The handle is 0 when cache creation failed earlier; in that case only
    // the SPIR-V modules are stored.
    std::vector<unsigned char> pipe_data;
    if (d->vk_pipeline_cache)
    {
        size_t pipe_data_size = 0;
        if (vkGetPipelineCacheData(vkdev->vkdevice(), d->vk_pipeline_cache, &pipe_data_size, nullptr) != VK_SUCCESS)
        {
            NCNN_LOGE("vkGetPipelineCacheData failed");
            return -1;
        }

        pipe_data.resize(pipe_data_size);
        if (pipe_data_size > 0)
        {
            if (vkGetPipelineCacheData(vkdev->vkdevice(), d->vk_pipeline_cache, &pipe_data_size, pipe_data.data()) != VK_SUCCESS)
            {
                NCNN_LOGE("vkGetPipelineCacheData failed");
                return -1;
            }

            // the second call reports how many bytes were actually written
            pipe_data.resize(pipe_data_size);
        }
    }

    PipelineCachePrivate::pipeline_cache_header header;

    // Platform information
    header.vendorID = vkdev->info.vendor_id();
    header.deviceID = vkdev->info.device_id();
    header.driverVersion = vkdev->info.driver_version();
    memcpy(header.uuid, vkdev->info.pipeline_cache_uuid(), VK_UUID_SIZE);

    header.spv_size = d->cache_spirv_module.size();
    header.pipeline_cache_size = (uint32_t)pipe_data.size();
    if ((size_t)header.pipeline_cache_size != pipe_data.size())
    {
        // the on-disk size field is 32 bits wide, refuse to truncate silently
        NCNN_LOGE("pipeline cache data too large");
        return -1;
    }

    // compute the final size up front so the output is allocated only once
    size_t total_size = sizeof(header) + pipe_data.size();
    for (size_t i = 0; i < d->cache_spirv_module.size(); i++)
    {
        const std::vector<uint32_t>& spv_data = d->cache_spirv_module[i].second;
        total_size += sizeof(PipelineCachePrivate::spv_param) + sizeof(uint32_t) + spv_data.size() * sizeof(uint32_t);
    }

    buf.resize(total_size);
    unsigned char* out = buf.data();

    memcpy(out, &header, sizeof(header));
    out += sizeof(header);

    // spv_digest and spv_data
    for (size_t i = 0; i < d->cache_spirv_module.size(); i++)
    {
        const PipelineCachePrivate::spv_param& sd = d->cache_spirv_module[i].first;
        const std::vector<uint32_t>& spv_data = d->cache_spirv_module[i].second;
        const uint32_t size = (uint32_t)spv_data.size();

        memcpy(out, &sd, sizeof(sd));
        out += sizeof(sd);
        memcpy(out, &size, sizeof(size));
        out += sizeof(size);

        if (!spv_data.empty())
        {
            memcpy(out, spv_data.data(), spv_data.size() * sizeof(uint32_t));
            out += spv_data.size() * sizeof(uint32_t);
        }
    }

    if (!pipe_data.empty())
    {
        memcpy(out, pipe_data.data(), pipe_data.size());
    }

    return 0;
 }

 int PipelineCache::load_cache(const std::vector<unsigned char>& buf) const
 {
    if (!vkdev)
    {
        NCNN_LOGE("vkdev is null");
        return -1;
    }
    MutexLockGuard lock(d->cache_lock);

    // Corrected struct name to pipeline_cache_header (lowercase h)
    if (buf.size() < sizeof(PipelineCachePrivate::pipeline_cache_header))
    {
        NCNN_LOGE("Invalid cache buffer size: too small for header");
        return -1;
    }

    PipelineCachePrivate::pipeline_cache_header header;
    memcpy(&header, buf.data(), sizeof(header));

    // Validate magic number
    if (header.magic != 0x5a545546)
    {
        NCNN_LOGE("Invalid cache magic number");
        return -1;
    }

    // Validate platform information for compatibility
    if (header.vendorID != vkdev->info.vendor_id() || header.deviceID != vkdev->info.device_id() || header.driverVersion != vkdev->info.driver_version() || memcmp(header.uuid, vkdev->info.pipeline_cache_uuid(), VK_UUID_SIZE) != 0)
    {
        NCNN_LOGE("Cache platform mismatch, might be incompatible.");
        return -1;
    }

    size_t current_offset = sizeof(header);

    // Load SPIR-V data and associated spv_param
    d->cache_spirv_module.reserve(header.spv_size);

    for (uint32_t i = 0; i < header.spv_size; ++i)
    {
        if (current_offset + sizeof(PipelineCachePrivate::spv_param) + sizeof(uint32_t) > buf.size())
        {
            NCNN_LOGE("Invalid cache buffer size: incomplete spv_param or size for entry %u", i);
            return -1;
        }

        PipelineCachePrivate::spv_param sd;
        memcpy(&sd, buf.data() + current_offset, sizeof(sd));
        current_offset += sizeof(sd);

        uint32_t spv_vec_size_uint32; // Size in uint32_t units
        memcpy(&spv_vec_size_uint32, buf.data() + current_offset, sizeof(spv_vec_size_uint32));
        current_offset += sizeof(spv_vec_size_uint32);

        size_t spv_data_byte_size = spv_vec_size_uint32 * sizeof(uint32_t);

        if (current_offset + spv_data_byte_size > buf.size())
        {
            NCNN_LOGE("Invalid cache buffer size: incomplete spv_data for entry %u", i);
            return -1;
        }

        std::vector<uint32_t> spirv_data(spv_vec_size_uint32);
        memcpy(spirv_data.data(), buf.data() + current_offset, spv_data_byte_size);
        current_offset += spv_data_byte_size;

        d->cache_spirv_module.push_back({sd, spirv_data});
    }

    // Load Vulkan Pipeline Cache Data
    if (current_offset + header.pipeline_cache_size > buf.size())
    {
        NCNN_LOGE("Invalid cache buffer size: incomplete pipeline cache data");
        return -1;
    }

    if (d->vk_pipeline_cache)
    {
        vkDestroyPipelineCache(vkdev->vkdevice(), d->vk_pipeline_cache, 0);
        d->vk_pipeline_cache = 0;
    }

    VkPipelineCacheCreateInfo pipelineCacheCreateInfo{};
    pipelineCacheCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
    pipelineCacheCreateInfo.initialDataSize = header.pipeline_cache_size;
    pipelineCacheCreateInfo.pInitialData = buf.data() + current_offset;

    int ret = vkdev->create_pipeline_cache(&pipelineCacheCreateInfo, 0, &d->vk_pipeline_cache);
    if (ret != 0)
    {
        NCNN_LOGE("create_pipeline_cache with initial data failed %d", ret);
        d->vk_pipeline_cache = 0;
        return -1;
    }

    return 0;
 }

 // Serialize the cache with save_cache(std::vector<unsigned char>&) and write
 // the resulting bytes to fp with a single fwrite.
 // Returns 0 on success; the serializer's error code when serialization fails;
 // -1 for a null stream or a short write.
 int PipelineCache::save_cache(FILE* fp) const
 {
    if (fp == 0)
    {
        NCNN_LOGE("Invalid FILE pointer for saving cache.");
        return -1;
    }

    std::vector<unsigned char> serialized;
    const int status = save_cache(serialized);
    if (status != 0)
    {
        NCNN_LOGE("Failed to get cache data into buffer for saving to file.");
        return status;
    }

    const size_t written = fwrite(serialized.data(), 1, serialized.size(), fp);
    if (written == serialized.size())
        return 0;

    NCNN_LOGE("Failed to write cache data to file.");
    return -1;
 }

 // Read everything from the current position of fp up to the end of the
 // stream and hand it to load_cache(const std::vector<unsigned char>&).
 // Reading starts at the current position (not at offset 0) so that it
 // mirrors save_cache(FILE*), which writes at the current position; for a
 // freshly opened file the two are the same.
 // Returns 0 on success, -1 when the stream is null, not seekable, or cannot
 // be read completely, otherwise the result of the buffer overload.
 int PipelineCache::load_cache(FILE* fp) const
 {
    if (!fp)
    {
        NCNN_LOGE("Invalid FILE pointer for loading cache.");
        return -1;
    }

    // measure the remaining bytes; every positioning call is checked because
    // the stream may not be seekable
    const long begin_pos = ftell(fp);
    if (begin_pos < 0 || fseek(fp, 0, SEEK_END) != 0)
    {
        NCNN_LOGE("Failed to determine file size for loading cache.");
        return -1;
    }

    const long end_pos = ftell(fp);
    if (end_pos < begin_pos || fseek(fp, begin_pos, SEEK_SET) != 0)
    {
        NCNN_LOGE("Failed to determine file size for loading cache.");
        return -1;
    }

    const size_t data_size = (size_t)(end_pos - begin_pos);

    std::vector<unsigned char> buf(data_size);
    if (data_size > 0 && fread(buf.data(), 1, data_size, fp) != data_size)
    {
        NCNN_LOGE("Failed to read cache data from file.");
        return -1;
    }

    return load_cache(buf);
 }

 // Write the serialized cache to the file at filename, truncating any
 // existing content.
 // Returns 0 on success and non-zero on failure, including a failure reported
 // only when the file is closed (buffered data that could not be flushed).
 int PipelineCache::save_cache(const char* filename) const
 {
    if (!filename)
    {
        NCNN_LOGE("Invalid filename for saving cache.");
        return -1;
    }

    FILE* fp = fopen(filename, "wb");
    if (!fp)
    {
        NCNN_LOGE("Failed to open file %s for writing cache.", filename);
        return -1;
    }

    int ret = save_cache(fp);

    // fclose flushes buffered output, so its result decides whether the data
    // really reached the file; an earlier error code is kept as is
    if (fclose(fp) != 0 && ret == 0)
    {
        NCNN_LOGE("Failed to close file %s after writing cache.", filename);
        ret = -1;
    }

    return ret;
 }

 // Open the file at filename for binary reading and load the cache from it
 // through load_cache(FILE*). The file is closed before returning.
 // Returns -1 for a null name or when the file cannot be opened, otherwise
 // the result of the stream overload.
 int PipelineCache::load_cache(const char* filename) const
 {
    if (filename == 0)
    {
        NCNN_LOGE("Invalid filename for loading cache.");
        return -1;
    }

    FILE* stream = fopen(filename, "rb");
    if (stream == 0)
    {
        NCNN_LOGE("Failed to open file %s for reading cache.", filename);
        return -1;
    }

    const int status = load_cache(stream);
    fclose(stream);
    return status;
 }

 int PipelineCache::create_shader_module(int shader_type_index, const Option& opt,
                                        uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
                                        VkShaderModule* _shader_module, ShaderInfo& si) const
 {
    uint32_t opt_bits = 0 << 7
                        | opt.use_fp16_packed << 6
                        | opt.use_fp16_storage << 5
                        | opt.use_fp16_arithmetic << 4
                        | opt.use_int8_storage << 3
                        | opt.use_int8_arithmetic << 2;

    std::vector<uint32_t> spirv;

    for (int i = 0; i < d->cache_spirv_module.size(); i++)
    {
        if (d->cache_spirv_module[i].first.d0 == PipelineCachePrivate::spv_param({shader_type_index, opt_bits}).d0) // hit cache
        {
            spirv = d->cache_spirv_module[i].second;
            goto hit_cache;
        }
    }

    int retc = compile_spirv_module(shader_type_index, opt, spirv);
    if (retc != 0)
    {
        NCNN_LOGE("compile_spirv_module failed %d", retc);
        return -1;
    }

    d->cache_spirv_module.push_back({{shader_type_index, opt_bits}, spirv});
 hit_cache:
    const uint32_t* spv_data = spirv.data();
    size_t spv_data_size = spirv.size() * 4;

@@ -445,7 +756,7 @@ int PipelineCache::new_pipeline(VkShaderModule shader_module, const ShaderInfo&
    if (ret != 0)
        goto ERROR_PipelineCache;

    ret = vkdev->create_pipeline(shader_module, pipeline_layout, specializations, subgroup_size, &pipeline);
    ret = vkdev->create_pipeline(shader_module, pipeline_layout, specializations, subgroup_size, &pipeline, d->vk_pipeline_cache);
    if (ret != 0)
        goto ERROR_PipelineCache;

--- a/src/pipelinecache.h
+++ b/src/pipelinecache.h
@@ -42,6 +42,16 @@ public:
                     VkDescriptorUpdateTemplateKHR* descriptor_update_template,
                     ShaderInfo& shader_info) const;

    // Pipeline cache persistence.
    // NOTE(review): callers treat a non-zero return value as failure; the exact
    // error codes of the buffer and FILE* overloads should be confirmed against
    // their definitions.

    // Serialize the cache into buf / restore the cache from buf.
    int save_cache(std::vector<unsigned char>& buf) const;
    int load_cache(const std::vector<unsigned char>& buf) const;

 #ifdef NCNN_STDIO
    // Same as above, but through an already opened stdio stream.
    // NOTE(review): presumably the caller keeps ownership of fp - confirm.
    int save_cache(FILE* fp) const;
    int load_cache(FILE* fp) const;

    // Same as above, but through a file path.
    // load_cache opens the file in binary read mode, logs an error and returns -1
    // when it cannot be opened, and otherwise forwards to load_cache(FILE*).
    int save_cache(const char* filename) const;
    int load_cache(const char* filename) const;
 #endif

 protected:
    int create_shader_module(int shader_type_index, const Option& opt,
                             uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -65,6 +65,7 @@ ncnn_add_test(paramdict)

 if(NCNN_VULKAN)
    ncnn_add_test(command)
    ncnn_add_test(pipecache)
 endif()

 if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
--- a/tests/test_pipecache.cpp
+++ b/tests/test_pipecache.cpp
@@ -0,0 +1,405 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "datareader.h"
 #include "gpu.h"
 #include "mat.h"
 #include "net.h"
 #include "pipelinecache.h"
 #include "testutil.h"

 #include <iostream>
 #include <chrono>
 #include <vector>

 // A data reader that supplies no real data: it allows a model structure to be
 // loaded while every weight is read back as zero.
 class DataReaderFromEmpty : public ncnn::DataReader
 {
 public:
    // Text scanning is a no-op: both arguments are ignored and 0 is returned.
    virtual int scan(const char* /*format*/, void* /*p*/) const
    {
        return 0;
    }

    // Zero-fills the caller's buffer and reports the whole request as read.
    virtual size_t read(void* buf, size_t size) const
    {
        memset(buf, 0, size);
        return size;
    }
 };

 // Network structure (param text) of MobileNetV3
 static const char* mobilenet_v3_param = R"delimiter(
 7767517
 145 163
 Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
 Convolution              313                      1 1 data 313 -23330=4,3,112,112,16 0=16 1=3 3=2 4=1 5=1 6=432
 Split                    splitncnn_0              1 2 313 313_splitncnn_0 313_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16
 HardSigmoid              319                      1 1 313_splitncnn_1 319 -23330=4,3,112,112,16
 BinaryOp                 320                      2 1 313_splitncnn_0 319 320 -23330=4,3,112,112,16 0=2
 Split                    splitncnn_1              1 2 320 320_splitncnn_0 320_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16
 ConvolutionDepthWise     321                      1 1 320_splitncnn_1 323 -23330=4,3,112,112,16 0=16 1=3 4=1 5=1 6=144 7=16 9=1
 Convolution              324                      1 1 323 324 -23330=4,3,112,112,16 0=16 1=1 5=1 6=256
 BinaryOp                 326                      2 1 320_splitncnn_0 324 326 -23330=4,3,112,112,16
 Convolution              327                      1 1 326 329 -23330=4,3,112,112,64 0=64 1=1 5=1 6=1024 9=1
 ConvolutionDepthWise     330                      1 1 329 332 -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1
 Convolution              333                      1 1 332 333 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1536
 Split                    splitncnn_2              1 2 333 333_splitncnn_0 333_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
 Convolution              335                      1 1 333_splitncnn_1 337 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
 ConvolutionDepthWise     338                      1 1 337 340 -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1
 Convolution              341                      1 1 340 341 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728
 BinaryOp                 343                      2 1 333_splitncnn_0 341 343 -23330=4,3,56,56,24
 Convolution              344                      1 1 343 346 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
 ConvolutionDepthWise     347                      1 1 346 347 -23330=4,3,28,28,72 0=72 1=5 3=2 4=2 5=1 6=1800 7=72
 Split                    splitncnn_3              1 2 347 347_splitncnn_0 347_splitncnn_1 -23330=8,3,28,28,72,3,28,28,72
 Pooling                  355                      1 1 347_splitncnn_1 359 -23330=4,1,72,1,1 0=1 4=1
 InnerProduct             360                      1 1 359 361 -23330=4,1,18,1,1 0=18 1=1 2=1296 9=1
 InnerProduct             362                      1 1 361 362 -23330=4,1,72,1,1 0=72 1=1 2=1296
 HardSigmoid              367                      1 1 362 367 -23330=4,1,72,1,1
 BinaryOp                 376                      2 1 347_splitncnn_0 367 376 -23330=4,3,28,28,72 0=2
 ReLU                     377                      1 1 376 377 -23330=4,3,28,28,72
 Convolution              378                      1 1 377 378 -23330=4,3,28,28,40 0=40 1=1 5=1 6=2880
 Split                    splitncnn_4              1 2 378 378_splitncnn_0 378_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
 Convolution              380                      1 1 378_splitncnn_1 382 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
 ConvolutionDepthWise     383                      1 1 382 383 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120
 Split                    splitncnn_5              1 2 383 383_splitncnn_0 383_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120
 Pooling                  391                      1 1 383_splitncnn_1 395 -23330=4,1,120,1,1 0=1 4=1
 InnerProduct             396                      1 1 395 397 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1
 InnerProduct             398                      1 1 397 398 -23330=4,1,120,1,1 0=120 1=1 2=3600
 HardSigmoid              403                      1 1 398 403 -23330=4,1,120,1,1
 BinaryOp                 412                      2 1 383_splitncnn_0 403 412 -23330=4,3,28,28,120 0=2
 ReLU                     413                      1 1 412 413 -23330=4,3,28,28,120
 Convolution              414                      1 1 413 414 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
 BinaryOp                 416                      2 1 378_splitncnn_0 414 416 -23330=4,3,28,28,40
 Split                    splitncnn_6              1 2 416 416_splitncnn_0 416_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
 Convolution              417                      1 1 416_splitncnn_1 419 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
 ConvolutionDepthWise     420                      1 1 419 420 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120
 Split                    splitncnn_7              1 2 420 420_splitncnn_0 420_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120
 Pooling                  428                      1 1 420_splitncnn_1 432 -23330=4,1,120,1,1 0=1 4=1
 InnerProduct             433                      1 1 432 434 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1
 InnerProduct             435                      1 1 434 435 -23330=4,1,120,1,1 0=120 1=1 2=3600
 HardSigmoid              440                      1 1 435 440 -23330=4,1,120,1,1
 BinaryOp                 449                      2 1 420_splitncnn_0 440 449 -23330=4,3,28,28,120 0=2
 ReLU                     450                      1 1 449 450 -23330=4,3,28,28,120
 Convolution              451                      1 1 450 451 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
 BinaryOp                 453                      2 1 416_splitncnn_0 451 453 -23330=4,3,28,28,40
 Convolution              454                      1 1 453 454 -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600
 HardSwish                461                      1 1 454 461 -23330=4,3,28,28,240
 ConvolutionDepthWise     462                      1 1 461 462 -23330=4,3,14,14,240 0=240 1=3 3=2 4=1 5=1 6=2160 7=240
 HardSwish                469                      1 1 462 469 -23330=4,3,14,14,240
 Convolution              470                      1 1 469 470 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
 Split                    splitncnn_8              1 2 470 470_splitncnn_0 470_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
 Convolution              472                      1 1 470_splitncnn_1 472 -23330=4,3,14,14,200 0=200 1=1 5=1 6=16000
 HardSwish                479                      1 1 472 479 -23330=4,3,14,14,200
 ConvolutionDepthWise     480                      1 1 479 480 -23330=4,3,14,14,200 0=200 1=3 4=1 5=1 6=1800 7=200
 HardSwish                487                      1 1 480 487 -23330=4,3,14,14,200
 Convolution              488                      1 1 487 488 -23330=4,3,14,14,80 0=80 1=1 5=1 6=16000
 BinaryOp                 490                      2 1 470_splitncnn_0 488 490 -23330=4,3,14,14,80
 Split                    splitncnn_9              1 2 490 490_splitncnn_0 490_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
 Convolution              491                      1 1 490_splitncnn_1 491 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720
 HardSwish                498                      1 1 491 498 -23330=4,3,14,14,184
 ConvolutionDepthWise     499                      1 1 498 499 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184
 HardSwish                506                      1 1 499 506 -23330=4,3,14,14,184
 Convolution              507                      1 1 506 507 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720
 BinaryOp                 509                      2 1 490_splitncnn_0 507 509 -23330=4,3,14,14,80
 Split                    splitncnn_10             1 2 509 509_splitncnn_0 509_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
 Convolution              510                      1 1 509_splitncnn_1 510 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720
 HardSwish                517                      1 1 510 517 -23330=4,3,14,14,184
 ConvolutionDepthWise     518                      1 1 517 518 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184
 HardSwish                525                      1 1 518 525 -23330=4,3,14,14,184
 Convolution              526                      1 1 525 526 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720
 BinaryOp                 528                      2 1 509_splitncnn_0 526 528 -23330=4,3,14,14,80
 Convolution              529                      1 1 528 529 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400
 HardSwish                536                      1 1 529 536 -23330=4,3,14,14,480
 ConvolutionDepthWise     537                      1 1 536 537 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480
 Split                    splitncnn_11             1 2 537 537_splitncnn_0 537_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
 Pooling                  545                      1 1 537_splitncnn_1 549 -23330=4,1,480,1,1 0=1 4=1
 InnerProduct             550                      1 1 549 551 -23330=4,1,120,1,1 0=120 1=1 2=57600 9=1
 InnerProduct             552                      1 1 551 552 -23330=4,1,480,1,1 0=480 1=1 2=57600
 HardSigmoid              557                      1 1 552 557 -23330=4,1,480,1,1
 BinaryOp                 566                      2 1 537_splitncnn_0 557 566 -23330=4,3,14,14,480 0=2
 HardSwish                572                      1 1 566 572 -23330=4,3,14,14,480
 Convolution              573                      1 1 572 573 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760
 Split                    splitncnn_12             1 2 573 573_splitncnn_0 573_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
 Convolution              575                      1 1 573_splitncnn_1 575 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
 HardSwish                582                      1 1 575 582 -23330=4,3,14,14,672
 ConvolutionDepthWise     583                      1 1 582 583 -23330=4,3,14,14,672 0=672 1=3 4=1 5=1 6=6048 7=672
 Split                    splitncnn_13             1 2 583 583_splitncnn_0 583_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
 Pooling                  591                      1 1 583_splitncnn_1 595 -23330=4,1,672,1,1 0=1 4=1
 InnerProduct             596                      1 1 595 597 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
 InnerProduct             598                      1 1 597 598 -23330=4,1,672,1,1 0=672 1=1 2=112896
 HardSigmoid              603                      1 1 598 603 -23330=4,1,672,1,1
 BinaryOp                 612                      2 1 583_splitncnn_0 603 612 -23330=4,3,14,14,672 0=2
 HardSwish                618                      1 1 612 618 -23330=4,3,14,14,672
 Convolution              619                      1 1 618 619 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264
 BinaryOp                 621                      2 1 573_splitncnn_0 619 621 -23330=4,3,14,14,112
 Convolution              622                      1 1 621 622 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
 HardSwish                629                      1 1 622 629 -23330=4,3,14,14,672
 ConvolutionDepthWise     630                      1 1 629 630 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672
 Split                    splitncnn_14             1 2 630 630_splitncnn_0 630_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
 Pooling                  638                      1 1 630_splitncnn_1 642 -23330=4,1,672,1,1 0=1 4=1
 InnerProduct             643                      1 1 642 644 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
 InnerProduct             645                      1 1 644 645 -23330=4,1,672,1,1 0=672 1=1 2=112896
 HardSigmoid              650                      1 1 645 650 -23330=4,1,672,1,1
 BinaryOp                 659                      2 1 630_splitncnn_0 650 659 -23330=4,3,14,14,672 0=2
 HardSwish                665                      1 1 659 665 -23330=4,3,14,14,672
 Convolution              666                      1 1 665 666 -23330=4,3,14,14,160 0=160 1=1 5=1 6=107520
 Convolution              668                      1 1 666 668 -23330=4,3,14,14,672 0=672 1=1 5=1 6=107520
 HardSwish                675                      1 1 668 675 -23330=4,3,14,14,672
 ConvolutionDepthWise     676                      1 1 675 676 -23330=4,3,7,7,672 0=672 1=5 3=2 4=2 5=1 6=16800 7=672
 Split                    splitncnn_15             1 2 676 676_splitncnn_0 676_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672
 Pooling                  684                      1 1 676_splitncnn_1 688 -23330=4,1,672,1,1 0=1 4=1
 InnerProduct             689                      1 1 688 690 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
 InnerProduct             691                      1 1 690 691 -23330=4,1,672,1,1 0=672 1=1 2=112896
 HardSigmoid              696                      1 1 691 696 -23330=4,1,672,1,1
 BinaryOp                 705                      2 1 676_splitncnn_0 696 705 -23330=4,3,7,7,672 0=2
 HardSwish                711                      1 1 705 711 -23330=4,3,7,7,672
 Convolution              712                      1 1 711 712 -23330=4,3,7,7,160 0=160 1=1 5=1 6=107520
 Split                    splitncnn_16             1 2 712 712_splitncnn_0 712_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160
 Convolution              714                      1 1 712_splitncnn_1 714 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600
 HardSwish                721                      1 1 714 721 -23330=4,3,7,7,960
 ConvolutionDepthWise     722                      1 1 721 722 -23330=4,3,7,7,960 0=960 1=5 4=2 5=1 6=24000 7=960
 Split                    splitncnn_17             1 2 722 722_splitncnn_0 722_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960
 Pooling                  730                      1 1 722_splitncnn_1 734 -23330=4,1,960,1,1 0=1 4=1
 InnerProduct             735                      1 1 734 736 -23330=4,1,240,1,1 0=240 1=1 2=230400 9=1
 InnerProduct             737                      1 1 736 737 -23330=4,1,960,1,1 0=960 1=1 2=230400
 HardSigmoid              742                      1 1 737 742 -23330=4,1,960,1,1
 BinaryOp                 751                      2 1 722_splitncnn_0 742 751 -23330=4,3,7,7,960 0=2
 HardSwish                757                      1 1 751 757 -23330=4,3,7,7,960
 Convolution              758                      1 1 757 758 -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600
 BinaryOp                 760                      2 1 712_splitncnn_0 758 760 -23330=4,3,7,7,160
 Convolution              761                      1 1 760 761 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600
 HardSwish                768                      1 1 761 768 -23330=4,3,7,7,960
 Pooling                  769                      1 1 768 769 -23330=4,1,960,1,1 0=1 4=1
 HardSwish                775                      1 1 769 775 -23330=4,1,960,1,1
 Reshape                  783                      1 1 775 783 -23330=4,1,960,1,1 0=-1
 InnerProduct             784                      1 1 783 784 -23330=4,1,1280,1,1 0=1280 1=1 2=1228800
 HardSwish                790                      1 1 784 790 -23330=4,1,1280,1,1
 InnerProduct             791                      1 1 790 791 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000
 Softmax                  prob                     1 1 791 output -23330=4,1,1000,1,1
 )delimiter";

 /**
 * @brief 使用一个简单的 Sigmoid 网络预热并测试 Pipeline Cache 的基本保存和加载功能
 * @return 0 on success, -1 on failure
 */
 static int warmup_gpu_pipecache()
 {
    std::cout << "==================================================" << std::endl;
    std::cout << "           Warmup: Testing Basic Cache IO         " << std::endl;
    std::cout << "==================================================" << std::endl;

    // 1. 创建一个网络，运行一次以生成 pipeline
    ncnn::Net net;
    net.opt.use_vulkan_compute = true;

    net.load_param_mem("7767517\n2 2\nInput    input0    0   1   input0\nSigmoid  sigmoid0  1   1   input0    output0");
    net.load_model((unsigned char*)""); // 用于创建 pipeline

    ncnn::Mat input0 = RandomMat(224, 224);
    ncnn::Mat output0;
    {
        ncnn::Extractor ex = net.create_extractor();
        ex.input("input0", input0);
        ex.extract("output0", output0);
    }

    if (output0.empty())
    {
        std::cerr << "Warmup failed: initial extraction failed." << std::endl;
        return -1;
    }

    // 2. 保存 pipeline cache
    const char* cache_path = "./sigmoid_pipecache.bin";
    if (net.opt.pipeline_cache->save_cache(cache_path) != 0)
    {
        std::cerr << "Warmup failed: could not save pipeline cache to " << cache_path << std::endl;
        return -1;
    }
    std::cout << "Warmup: Pipeline cache saved successfully." << std::endl;

    // 3. 创建第二个网络，加载刚才保存的 cache
    ncnn::Net net2;
    net2.opt.use_vulkan_compute = true;
    net2.opt.pipeline_cache = new ncnn::PipelineCache(net.vulkan_device());

    net2.load_param_mem("7767517\n2 2\nInput    input0    0   1   input0\nSigmoid  sigmoid0  1   1   input0    output0");
    if (net2.opt.pipeline_cache->load_cache(cache_path) != 0)
    {
        std::cerr << "Warmup failed: could not load pipeline cache from " << cache_path << std::endl;
        return -1;
    }
    std::cout << "Warmup: Pipeline cache loaded successfully." << std::endl;
    net2.load_model((unsigned char*)""); // 创建 pipeline

    // 4. 再次推理并验证结果是否一致
    ncnn::Mat output0_2;
    {
        ncnn::Extractor ex2 = net2.create_extractor();
        ex2.input("input0", input0);
        ex2.extract("output0", output0_2);
    }

    if (output0_2.empty())
    {
        std::cerr << "Warmup failed: extraction after loading cache failed." << std::endl;
        return -1;
    }

    if (CompareMat(output0, output0_2, 0.001) != 0)
    {
        std::cerr << "Warmup failed: output mismatch after loading cache." << std::endl;
        return -1;
    }

    std::cout << "Warmup PASSED: Outputs are identical." << std::endl;
    return 0;
 }

 /**
 * @brief 对比使用和不使用 Pipeline Cache 时的模型加载性能
 * @return 0 on success, -1 on failure
 */
 static int test_gpu_pipecache_performance()
 {
    ncnn::Mat output_no_cache;
    double time_no_cache = 0;

    const char* cache_path = "./mobilenet_pipecache.bin";
    DataReaderFromEmpty dr;
    ncnn::Mat input = RandomMat(224, 224, 3);

    // -------------------------------------------------
    // 1. 不使用 Pipeline Cache (首次加载)
    // -------------------------------------------------
    std::cout << "\n==================================================" << std::endl;
    std::cout << "       Performance Test: Without Pipeline Cache   " << std::endl;
    std::cout << "==================================================" << std::endl;
    {
        ncnn::Net net_no_cache;
        net_no_cache.opt.use_vulkan_compute = true;

        auto start = std::chrono::high_resolution_clock::now();

        net_no_cache.load_param_mem(mobilenet_v3_param);
        net_no_cache.load_model(dr);

        auto end = std::chrono::high_resolution_clock::now();
        time_no_cache = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start).count();
        std::cout << "Model loading time without cache: " << time_no_cache << " ms" << std::endl;

        // 推理以获得基准输出
        ncnn::Extractor ex = net_no_cache.create_extractor();
        ex.input("data", input);
        ex.extract("output", output_no_cache);

        if (output_no_cache.empty())
        {
            std::cerr << "Test failed: extraction without cache failed." << std::endl;
            return -1;
        }

        // 保存 cache 以供下一步使用
        if (net_no_cache.opt.pipeline_cache->save_cache(cache_path) != 0)
        {
            std::cerr << "Test failed: could not save pipeline cache to " << cache_path << std::endl;
            return -1;
        }
        std::cout << "Pipeline cache generated and saved to " << cache_path << std::endl;
    }

    // -------------------------------------------------
    // 2. 使用 Pipeline Cache (二次加载)
    // -------------------------------------------------
    ncnn::Mat output_with_cache;
    double time_with_cache = 0;
    std::cout << "\n==================================================" << std::endl;
    std::cout << "        Performance Test: With Pipeline Cache     " << std::endl;
    std::cout << "==================================================" << std::endl;
    {
        ncnn::Net net_with_cache;
        // 必须在加载模型前设置好 cache
        net_with_cache.opt.pipeline_cache = new ncnn::PipelineCache(ncnn::get_gpu_device());
        net_with_cache.opt.use_vulkan_compute = true;

        auto start = std::chrono::high_resolution_clock::now();

        // 从文件加载 cache
        if (net_with_cache.opt.pipeline_cache->load_cache(cache_path) != 0)
        {
            std::cerr << "Test failed: could not load pipeline cache from " << cache_path << std::endl;
            return -1;
        }
        net_with_cache.load_param_mem(mobilenet_v3_param);
        net_with_cache.load_model(dr);

        auto end = std::chrono::high_resolution_clock::now();
        time_with_cache = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start).count();
        std::cout << "Model loading time with cache: " << time_with_cache << " ms" << std::endl;

        // 推理
        ncnn::Extractor ex2 = net_with_cache.create_extractor();
        ex2.input("data", input);
        ex2.extract("output", output_with_cache);

        if (output_with_cache.empty())
        {
            std::cerr << "Test failed: extraction with cache failed." << std::endl;
            return -1;
        }
    }

    // -------------------------------------------------
    // 3. 结果验证与总结
    // -------------------------------------------------
    std::cout << "\n==================================================" << std::endl;
    std::cout << "              Verification and Summary            " << std::endl;
    std::cout << "==================================================" << std::endl;

    bool is_output_same = (CompareMat(output_no_cache, output_with_cache, 0.001) == 0);

    std::cout << "Output verification: " << (is_output_same ? "SUCCESS" : "FAILURE") << std::endl;
    std::cout << "--------------------------------------------------" << std::endl;
    std::cout << "Performance Summary:" << std::endl;
    std::cout << "  - Without Cache: " << time_no_cache << " ms" << std::endl;
    std::cout << "  - With Cache:    " << time_with_cache << " ms" << std::endl;

    if (time_no_cache > 0) {
        double speedup = (time_no_cache - time_with_cache) / time_no_cache * 100;
        std::cout << "  - Speedup:       " << speedup << "%" << std::endl;
    }

    if (!is_output_same)
    {
        std::cerr << "\nTest FAILED due to output mismatch." << std::endl;
        return -1;
    }

    std::cout << "\nTest PASSED." << std::endl;
    return 0;
 }

 int main()
 {
    // The warmup checks basic cache save/load I/O; the performance
    // comparison is skipped when it fails.
    const int warmup_status = warmup_gpu_pipecache();
    if (warmup_status != 0)
        return -1;

    // The result of the performance comparison is the test's exit status.
    const int perf_status = test_gpu_pipecache_performance();
    return perf_status;
 }