diff --git a/src/gpu.cpp b/src/gpu.cpp
index 00a711d09..06fe089ac 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -98,6 +98,7 @@ struct layer_shader_registry_entry
 
 static const layer_shader_registry_entry layer_shader_registry[] = {
 #include "layer_shader_registry.h"
+
 };
 
 static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry);
@@ -476,16 +477,16 @@ void GpuInfoPrivate::query_properties()
     }
 
     if (physicalDeviceProperties.vendorID == 0x13b5
-            && (physicalDeviceProperties.deviceID == 0x7500001
-                || physicalDeviceProperties.deviceID == 0x7501000
-                || physicalDeviceProperties.deviceID == 0x8602000
-                || physicalDeviceProperties.deviceID == 0x8800020
-                || physicalDeviceProperties.deviceID == 0x70930000
-                || physicalDeviceProperties.deviceID == 0x70901010
-                || physicalDeviceProperties.deviceID == 0x72120000
-                || physicalDeviceProperties.deviceID == 0x74021000
-                || physicalDeviceProperties.deviceID == 0x60a00002
-                || physicalDeviceProperties.deviceID == 0x62210001))
+        && (physicalDeviceProperties.deviceID == 0x7500001
+            || physicalDeviceProperties.deviceID == 0x7501000
+            || physicalDeviceProperties.deviceID == 0x8602000
+            || physicalDeviceProperties.deviceID == 0x8800020
+            || physicalDeviceProperties.deviceID == 0x70930000
+            || physicalDeviceProperties.deviceID == 0x70901010
+            || physicalDeviceProperties.deviceID == 0x72120000
+            || physicalDeviceProperties.deviceID == 0x74021000
+            || physicalDeviceProperties.deviceID == 0x60a00002
+            || physicalDeviceProperties.deviceID == 0x62210001))
     {
         // NOTE rk3288/rk3399/t880/g31/g51/g52/g71/g72
         // however, g76/g77 has explicit fp16 arithmetic
@@ -494,9 +495,9 @@ void GpuInfoPrivate::query_properties()
     }
 
     if (physicalDeviceProperties.vendorID == 0x5143
-            && (physicalDeviceProperties.deviceID == 0x6030001
-                || physicalDeviceProperties.deviceID == 0x6040001
-                || physicalDeviceProperties.deviceID == 0x6050002))
+        && (physicalDeviceProperties.deviceID == 0x6030001
+            || physicalDeviceProperties.deviceID == 0x6040001
+            || physicalDeviceProperties.deviceID == 0x6050002))
     {
         // TODO enable devices other than qcom845/qcom855/qcom855plus/qcom865
         // qcom adreno driver accept spirv with fp16 arithmetic
@@ -512,7 +513,7 @@ static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyPropert
         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
 
         if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
-                && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
+            && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
         {
             return i;
         }
@@ -524,7 +525,7 @@ static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyPropert
         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
 
         if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
-                && (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
+            && (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
         {
             return i;
         }
@@ -553,8 +554,8 @@ static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProper
         const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
 
         if ((queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
-                && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
-                && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
+            && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
+            && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
         {
             return i;
         }
@@ -1137,30 +1138,30 @@ void GpuInfoPrivate::query_extension_properties()
             // NCNN_LOGE("cpm %2d %2d %2d  %d %d %d %d  %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope);
 
             if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
-                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
-                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
-                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
+                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
+                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
+                && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
             {
                 support_cooperative_matrix_8_8_16 = true;
             }
             if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
-                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
-                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
-                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
+                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
+                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
+                && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
             {
                 support_cooperative_matrix_16_8_8 = true;
             }
             if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16
-                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
-                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
-                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
+                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
+                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
+                && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
             {
                 support_cooperative_matrix_16_8_16 = true;
             }
             if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16
-                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
-                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
-                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
+                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
+                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
+                && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
             {
                 support_cooperative_matrix_16_16_16 = true;
             }
@@ -1194,30 +1195,30 @@ void GpuInfoPrivate::query_extension_properties()
             // NCNN_LOGE("cpm %2d %2d %2d  %d %d %d %d  %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope);
 
             if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
-                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
-                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
-                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
+                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
+                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
+                && cmp.scope == VK_SCOPE_SUBGROUP_NV)
             {
                 support_cooperative_matrix_8_8_16 = true;
             }
             if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
-                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
-                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
-                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
+                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
+                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
+                && cmp.scope == VK_SCOPE_SUBGROUP_NV)
             {
                 support_cooperative_matrix_16_8_8 = true;
             }
             if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16
-                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
-                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
-                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
+                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
+                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
+                && cmp.scope == VK_SCOPE_SUBGROUP_NV)
             {
                 support_cooperative_matrix_16_8_16 = true;
             }
             if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16
-                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
-                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
-                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
+                && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
+                && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
+                && cmp.scope == VK_SCOPE_SUBGROUP_NV)
             {
                 support_cooperative_matrix_16_16_16 = true;
             }
@@ -2031,8 +2032,8 @@ void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponen
             const VkCooperativeMatrixPropertiesKHR& cmp = d->queryCooperativeMatrixSubProperties[i];
 
             if (cmp.AType == type && cmp.BType == type
-                    && cmp.CType == acctype && cmp.ResultType == acctype
-                    && cmp.scope == scope)
+                && cmp.CType == acctype && cmp.ResultType == acctype
+                && cmp.scope == scope)
             {
                 mnk_properties.push_back(cmp);
             }
@@ -2045,8 +2046,8 @@ void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponen
             const VkCooperativeMatrixPropertiesNV& cmp = d->queryCooperativeMatrixSubPropertiesNV[i];
 
             if (cmp.AType == (VkComponentTypeNV)type && cmp.BType == (VkComponentTypeNV)type
-                    && cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype
-                    && cmp.scope == (VkScopeNV)scope)
+                && cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype
+                && cmp.scope == (VkScopeNV)scope)
             {
                 VkCooperativeMatrixPropertiesKHR cmp_khr;
                 cmp_khr.MSize = cmp.MSize;
@@ -2459,7 +2460,7 @@ int create_gpu_instance(const char* driver_path)
 #endif // __ANDROID_API__ >= 26
 
     uint32_t instance_api_version = VK_MAKE_VERSION(1, 0, 0);
-    typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion);
+    typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t* pApiVersion);
     PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion");
     if (vkEnumerateInstanceVersion)
     {
@@ -2672,7 +2673,7 @@ int create_gpu_instance(const char* driver_path)
                         fp16_matrix_properties.push_back(cmp);
                 }
                 if ((cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.AType == VK_COMPONENT_TYPE_SINT8_PACKED_NV)
-                        && (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV))
+                    && (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV))
                 {
                     bool mnk_hit = false;
                     for (size_t k = 0; k < int8_matrix_properties.size(); k++)
@@ -2703,9 +2704,9 @@ int create_gpu_instance(const char* driver_path)
                         bf16_matrix_properties.push_back(cmp);
                 }
                 if ((cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
-                        || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)
-                        && (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
-                            || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV))
+                     || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)
+                    && (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
+                        || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV))
                 {
                     bool mnk_hit = false;
                     for (size_t k = 0; k < fp8_matrix_properties.size(); k++)
@@ -3143,8 +3144,9 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_
     uop->vkdev = vkdev;
 
     ncnn::ParamDict pd;
-    pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack
-    pd.set(2, cast_type_from_index + 1);                                            // 0=auto 1=fp32 2=fp16 3=int8
+    pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4
+                                                                          : 8); // out_elempack
+    pd.set(2, cast_type_from_index + 1);                                        // 0=auto 1=fp32 2=fp16 3=int8
     pd.set(3, cast_type_to_index + 1);
 
     uop->load_param(pd);
@@ -3734,7 +3736,7 @@ int VulkanDevice::create_pipeline_layout(int push_constant_count, VkDescriptorSe
     return 0;
 }
 
-int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const
+int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline, VkPipelineCache pipeline_cache) const
 {
     const int specialization_count = specializations.size();
 
@@ -3792,7 +3794,7 @@ int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout
     computePipelineCreateInfo.basePipelineHandle = 0;
     computePipelineCreateInfo.basePipelineIndex = 0;
 
-    VkResult ret = vkCreateComputePipelines(d->device, 0, 1, &computePipelineCreateInfo, 0, pipeline);
+    VkResult ret = vkCreateComputePipelines(d->device, pipeline_cache, 1, &computePipelineCreateInfo, 0, pipeline);
     if (ret != VK_SUCCESS)
     {
         NCNN_LOGE("vkCreateComputePipelines failed %d", ret);
@@ -3871,6 +3873,18 @@ int VulkanDevice::create_descriptor_update_template(int binding_count, const int
     return 0;
 }
 
+int VulkanDevice::create_pipeline_cache(const VkPipelineCacheCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineCache* pPipelineCache) const
+{
+    // forwards to vkCreatePipelineCache on this device
+    // returns 0 on success, -1 (after logging the VkResult) otherwise
+    const VkResult result = vkCreatePipelineCache(d->device, pCreateInfo, pAllocator, pPipelineCache);
+    if (result == VK_SUCCESS)
+        return 0;
+
+    NCNN_LOGE("vkCreatePipelineCache failed %d", result);
+    return -1;
+}
+
 uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const
 {
     const VkPhysicalDeviceMemoryProperties& memory_properties = info.physicalDeviceMemoryProperties();
@@ -3883,8 +3897,8 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ
         {
             const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
             if ((memoryType.propertyFlags & required) == required
-                    && (preferred && (memoryType.propertyFlags & preferred))
-                    && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
+                && (preferred && (memoryType.propertyFlags & preferred))
+                && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
             {
                 return i;
             }
@@ -3899,7 +3913,7 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ
         {
             const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
             if ((memoryType.propertyFlags & required) == required
-                    && (preferred && (memoryType.propertyFlags & preferred)))
+                && (preferred && (memoryType.propertyFlags & preferred)))
             {
                 return i;
             }
@@ -3914,7 +3928,7 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ
         {
             const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
             if ((memoryType.propertyFlags & required) == required
-                    && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
+                && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
             {
                 return i;
             }
@@ -4222,7 +4236,8 @@ void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempac
 
 void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const
 {
-    int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;
+    int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1
+                                                                          : 2;
 
     int cast_type_from_index;
     if (src.elembits() == 32)
diff --git a/src/gpu.h b/src/gpu.h
index 7863b2e21..c3256339b 100644
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -419,8 +419,9 @@ public:
     // helper for creating pipeline
     int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
     int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
-    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const;
+    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline, VkPipelineCache pipeline_cache = 0) const;
     int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
+    int create_pipeline_cache(const VkPipelineCacheCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineCache* pPipelineCache) const;
 
     uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
     bool is_mappable(uint32_t memory_type_index) const;
diff --git a/src/pipelinecache.cpp b/src/pipelinecache.cpp
index 1bd274514..cde9ee9ca 100644
--- a/src/pipelinecache.cpp
+++ b/src/pipelinecache.cpp
@@ -110,13 +110,42 @@ public:
         ShaderInfo shader_info; // TODO use pointer ?
     };
 
+    struct spv_param // key of one cached SPIR-V module; create_shader_module compares keys through d0
+    {
+        union
+        {
+            struct
+            {
+                int32_t shader_type_index; // shader index handed to create_shader_module
+                uint32_t opt_bits;         // Option flags packed into a bitmask by create_shader_module
+            };
+            uint64_t d0; // the two fields above viewed as a single 64-bit value
+        };
+    };
+
+    struct pipeline_cache_header // copied verbatim to the front of the buffer produced by save_cache
+    {
+        uint32_t magic = 0x5a545546; // signature that load_cache checks before reading anything else
+        uint32_t vendorID;          // VkPhysicalDeviceProperties::vendorID
+        uint32_t deviceID;          // VkPhysicalDeviceProperties::deviceID
+        uint32_t driverVersion;     // VkPhysicalDeviceProperties::driverVersion
+        uint8_t uuid[VK_UUID_SIZE]; // VkPhysicalDeviceProperties::pipelineCacheUUID
+
+        uint32_t spv_size; // number of spirv module entries stored after the header
+        uint32_t pipeline_cache_size; // byte size of the VkPipelineCache data stored after the spirv entries
+    };
+
     mutable std::vector<pipeline_cache_digest> cache_digests;
     mutable std::vector<pipeline_cache_artifact> cache_artifacts;
+
+    VkPipelineCache vk_pipeline_cache;
+    mutable std::vector<std::pair<spv_param, std::vector<uint32_t> > > cache_spirv_module; // digest(index,opt) -> spirv data
+
     mutable Mutex cache_lock;
 };
 
 PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
-        uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
+                                                                   uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
 {
     spv_data_murmur3 = murmur3_32(spv_data, spv_data_size / 4);
 
@@ -134,7 +163,7 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_
 }
 
 PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
-        uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
+                                                                   uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size)
 {
     shader_type_index = _shader_type_index;
 
@@ -160,6 +189,18 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_t
 PipelineCache::PipelineCache(const VulkanDevice* _vkdev)
     : vkdev(_vkdev), d(new PipelineCachePrivate)
 {
+    // begin with an empty driver pipeline cache: no initial data is supplied
+    VkPipelineCacheCreateInfo emptyCacheInfo{};
+    emptyCacheInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
+    emptyCacheInfo.pInitialData = nullptr;
+    emptyCacheInfo.initialDataSize = 0;
+
+    const int status = _vkdev->create_pipeline_cache(&emptyCacheInfo, 0, &d->vk_pipeline_cache);
+    if (status != 0)
+    {
+        NCNN_LOGE("create_pipeline_cache failed %d", status);
+        d->vk_pipeline_cache = 0; // leave a null handle behind when creation fails
+    }
 }
 
 PipelineCache::~PipelineCache()
@@ -381,18 +422,288 @@ int PipelineCache::get_pipeline(int shader_type_index, const Option& opt, const
     return 0;
 }
 
+int PipelineCache::save_cache(std::vector<unsigned char>& buf) const
+{
+    // buf layout: header | spv_size x (spv_param, u32 word count, words) | VkPipelineCache data
+    if (!vkdev)
+    {
+        NCNN_LOGE("vkdev is null");
+        return -1;
+    }
+    MutexLockGuard lock(d->cache_lock);
+
+    PipelineCachePrivate::pipeline_cache_header header;
+    // Platform information
+    header.vendorID = vkdev->info.vendor_id();
+    header.deviceID = vkdev->info.device_id();
+    header.driverVersion = vkdev->info.driver_version();
+    memcpy(header.uuid, vkdev->info.pipeline_cache_uuid(), VK_UUID_SIZE);
+
+    header.spv_size = d->cache_spirv_module.size();
+
+    // the handle is null when cache creation failed earlier; no driver data is stored in that case
+    size_t buf_size = 0;
+    if (d->vk_pipeline_cache && vkGetPipelineCacheData(vkdev->vkdevice(), d->vk_pipeline_cache, &buf_size, nullptr) != VK_SUCCESS)
+    {
+        NCNN_LOGE("vkGetPipelineCacheData failed");
+        return -1;
+    }
+    std::vector<unsigned char> pipe_data(buf_size);
+    if (buf_size > 0 && vkGetPipelineCacheData(vkdev->vkdevice(), d->vk_pipeline_cache, &buf_size, pipe_data.data()) != VK_SUCCESS)
+    {
+        NCNN_LOGE("vkGetPipelineCacheData failed");
+        return -1;
+    }
+    // the second call reports the bytes actually written, which is what the header must record
+    pipe_data.resize(buf_size);
+    header.pipeline_cache_size = (uint32_t)buf_size;
+
+    buf.resize(sizeof(header));
+    memcpy(buf.data(), &header, sizeof(header));
+
+    // spv_digest and spv_data
+    for (size_t i = 0; i < d->cache_spirv_module.size(); i++)
+    {
+        const PipelineCachePrivate::spv_param& sd = d->cache_spirv_module[i].first;
+        const std::vector<uint32_t>& spv_data = d->cache_spirv_module[i].second;
+        uint32_t size = (uint32_t)spv_data.size();
+
+        size_t current_buf_size = buf.size();
+        buf.resize(current_buf_size + sizeof(sd) + sizeof(size) + spv_data.size() * sizeof(uint32_t));
+        memcpy(buf.data() + current_buf_size, &sd, sizeof(sd));
+        current_buf_size += sizeof(sd);
+        memcpy(buf.data() + current_buf_size, &size, sizeof(size));
+        current_buf_size += sizeof(size);
+        memcpy(buf.data() + current_buf_size, spv_data.data(), spv_data.size() * sizeof(uint32_t));
+    }
+
+    buf.insert(buf.end(), pipe_data.begin(), pipe_data.end());
+    return 0;
+}
+
+int PipelineCache::load_cache(const std::vector<unsigned char>& buf) const
+{
+    // restores what save_cache wrote; returns 0 on success, -1 when buf is rejected or the driver cache cannot be created
+    if (!vkdev)
+    {
+        NCNN_LOGE("vkdev is null");
+        return -1;
+    }
+    MutexLockGuard lock(d->cache_lock);
+
+    // buf is untrusted input: every size read from it is checked against the bytes that remain
+    if (buf.size() < sizeof(PipelineCachePrivate::pipeline_cache_header))
+    {
+        NCNN_LOGE("Invalid cache buffer size: too small for header");
+        return -1;
+    }
+
+    PipelineCachePrivate::pipeline_cache_header header;
+    memcpy(&header, buf.data(), sizeof(header));
+
+    // Validate magic number
+    if (header.magic != 0x5a545546)
+    {
+        NCNN_LOGE("Invalid cache magic number");
+        return -1;
+    }
+
+    // Validate platform information for compatibility
+    if (header.vendorID != vkdev->info.vendor_id() || header.deviceID != vkdev->info.device_id() || header.driverVersion != vkdev->info.driver_version() || memcmp(header.uuid, vkdev->info.pipeline_cache_uuid(), VK_UUID_SIZE) != 0)
+    {
+        NCNN_LOGE("Cache platform mismatch, might be incompatible.");
+        return -1;
+    }
+
+    size_t current_offset = sizeof(header);
+
+    // Parse SPIR-V entries into a staging list so a malformed buffer leaves this object untouched
+    std::vector<std::pair<PipelineCachePrivate::spv_param, std::vector<uint32_t> > > loaded_modules;
+    for (uint32_t i = 0; i < header.spv_size; ++i)
+    {
+        if (buf.size() - current_offset < sizeof(PipelineCachePrivate::spv_param) + sizeof(uint32_t))
+        {
+            NCNN_LOGE("Invalid cache buffer size: incomplete spv_param or size for entry %u", i);
+            return -1;
+        }
+
+        PipelineCachePrivate::spv_param sd;
+        memcpy(&sd, buf.data() + current_offset, sizeof(sd));
+        current_offset += sizeof(sd);
+
+        uint32_t spv_vec_size_uint32; // Size in uint32_t units
+        memcpy(&spv_vec_size_uint32, buf.data() + current_offset, sizeof(spv_vec_size_uint32));
+        current_offset += sizeof(spv_vec_size_uint32);
+
+        // compare in word units against the remaining bytes so the check cannot overflow size_t
+        if (spv_vec_size_uint32 > (buf.size() - current_offset) / sizeof(uint32_t))
+        {
+            NCNN_LOGE("Invalid cache buffer size: incomplete spv_data for entry %u", i);
+            return -1;
+        }
+        size_t spv_data_byte_size = spv_vec_size_uint32 * sizeof(uint32_t);
+        std::vector<uint32_t> spirv_data(spv_vec_size_uint32);
+        if (spv_data_byte_size > 0)
+            memcpy(spirv_data.data(), buf.data() + current_offset, spv_data_byte_size);
+        current_offset += spv_data_byte_size;
+
+        loaded_modules.push_back({sd, spirv_data});
+    }
+
+    // Load Vulkan Pipeline Cache Data
+    if (header.pipeline_cache_size > buf.size() - current_offset)
+    {
+        NCNN_LOGE("Invalid cache buffer size: incomplete pipeline cache data");
+        return -1;
+    }
+
+    VkPipelineCacheCreateInfo pipelineCacheCreateInfo{};
+    pipelineCacheCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO;
+    pipelineCacheCreateInfo.initialDataSize = header.pipeline_cache_size;
+    pipelineCacheCreateInfo.pInitialData = buf.data() + current_offset;
+
+    // create the replacement first so the current driver cache survives a failure
+    VkPipelineCache new_pipeline_cache = 0;
+    int ret = vkdev->create_pipeline_cache(&pipelineCacheCreateInfo, 0, &new_pipeline_cache);
+    if (ret != 0)
+    {
+        NCNN_LOGE("create_pipeline_cache with initial data failed %d", ret);
+        return -1;
+    }
+
+    if (d->vk_pipeline_cache)
+        vkDestroyPipelineCache(vkdev->vkdevice(), d->vk_pipeline_cache, 0);
+    d->vk_pipeline_cache = new_pipeline_cache;
+    d->cache_spirv_module.insert(d->cache_spirv_module.end(), loaded_modules.begin(), loaded_modules.end());
+
+    return 0;
+}
+
+int PipelineCache::save_cache(FILE* fp) const
+{
+    // serialize into memory first, then emit the bytes to fp with a single fwrite
+    if (!fp)
+    {
+        NCNN_LOGE("Invalid FILE pointer for saving cache.");
+        return -1;
+    }
+
+    std::vector<unsigned char> blob;
+    const int status = save_cache(blob);
+    if (status != 0)
+    {
+        NCNN_LOGE("Failed to get cache data into buffer for saving to file.");
+        return status;
+    }
+
+    const size_t written = fwrite(blob.data(), 1, blob.size(), fp);
+    if (written == blob.size())
+        return 0;
+
+    NCNN_LOGE("Failed to write cache data to file.");
+    return -1;
+}
+
+int PipelineCache::load_cache(FILE* fp) const
+{
+    if (!fp)
+    {
+        NCNN_LOGE("Invalid FILE pointer for loading cache.");
+        return -1;
+    }
+
+    // reads the whole file from its start; the length is measured by seeking to the end, and a failed seek or tell is reported
+    long file_size = -1;
+    if (fseek(fp, 0, SEEK_END) == 0)
+        file_size = ftell(fp);
+    if (file_size < 0 || fseek(fp, 0, SEEK_SET) != 0)
+    {
+        NCNN_LOGE("Failed to determine file size for loading cache.");
+        return -1;
+    }
+
+    std::vector<unsigned char> buf(file_size);
+    if (file_size > 0 && fread(buf.data(), 1, file_size, fp) != (size_t)file_size)
+    {
+        NCNN_LOGE("Failed to read cache data from file.");
+        return -1;
+    }
+
+    return load_cache(buf);
+}
+
+int PipelineCache::save_cache(const char* filename) const
+{
+    // writes the serialized cache to filename, opened with "wb"; returns 0 on success
+    if (!filename)
+    {
+        NCNN_LOGE("Invalid filename for saving cache.");
+        return -1;
+    }
+    FILE* fp = fopen(filename, "wb");
+    if (!fp)
+    {
+        NCNN_LOGE("Failed to open file %s for writing cache.", filename);
+        return -1;
+    }
+    int ret = save_cache(fp);
+    // fclose flushes buffered bytes, so a failing close means the file on disk is incomplete
+    if (fclose(fp) != 0 && ret == 0)
+        ret = -1;
+    return ret;
+}
+
+int PipelineCache::load_cache(const char* filename) const
+{
+    // opens filename for binary reading and hands the stream to load_cache(FILE*)
+    if (!filename)
+    {
+        NCNN_LOGE("Invalid filename for loading cache.");
+        return -1;
+    }
+
+    FILE* stream = fopen(filename, "rb");
+    if (!stream)
+    {
+        NCNN_LOGE("Failed to open file %s for reading cache.", filename);
+        return -1;
+    }
+
+    const int status = load_cache(stream);
+    fclose(stream); // closed regardless of the load result
+    return status;
+}
+
 int PipelineCache::create_shader_module(int shader_type_index, const Option& opt,
                                         uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
                                         VkShaderModule* _shader_module, ShaderInfo& si) const
 {
+    uint32_t opt_bits = 0 << 7
+                        | opt.use_fp16_packed << 6
+                        | opt.use_fp16_storage << 5
+                        | opt.use_fp16_arithmetic << 4
+                        | opt.use_int8_storage << 3
+                        | opt.use_int8_arithmetic << 2;
+
     std::vector<uint32_t> spirv;
+
+    for (int i = 0; i < d->cache_spirv_module.size(); i++)
+    {
+        if (d->cache_spirv_module[i].first.d0 == PipelineCachePrivate::spv_param({shader_type_index, opt_bits}).d0) // hit cache
+        {
+            spirv = d->cache_spirv_module[i].second;
+            goto hit_cache;
+        }
+    }
+
     int retc = compile_spirv_module(shader_type_index, opt, spirv);
     if (retc != 0)
     {
         NCNN_LOGE("compile_spirv_module failed %d", retc);
         return -1;
     }
-
+    d->cache_spirv_module.push_back({{shader_type_index, opt_bits}, spirv});
+hit_cache:
     const uint32_t* spv_data = spirv.data();
     size_t spv_data_size = spirv.size() * 4;
 
@@ -445,7 +756,7 @@ int PipelineCache::new_pipeline(VkShaderModule shader_module, const ShaderInfo&
     if (ret != 0)
         goto ERROR_PipelineCache;
 
-    ret = vkdev->create_pipeline(shader_module, pipeline_layout, specializations, subgroup_size, &pipeline);
+    ret = vkdev->create_pipeline(shader_module, pipeline_layout, specializations, subgroup_size, &pipeline, d->vk_pipeline_cache);
     if (ret != 0)
         goto ERROR_PipelineCache;
 
diff --git a/src/pipelinecache.h b/src/pipelinecache.h
index b93c0cfd8..15086e268 100644
--- a/src/pipelinecache.h
+++ b/src/pipelinecache.h
@@ -42,6 +42,16 @@ public:
                      VkDescriptorUpdateTemplateKHR* descriptor_update_template,
                      ShaderInfo& shader_info) const;
 
+    int save_cache(std::vector<unsigned char>& buf) const;
+    int load_cache(const std::vector<unsigned char>& buf) const;
+
+#ifdef NCNN_STDIO // NOTE(review): if NCNN_STDIO is a 0/1 macro this should be #if, matching the definitions - confirm
+    int save_cache(FILE* fp) const;
+    int load_cache(FILE* fp) const;
+    int save_cache(const char* filename) const; // opens filename with "wb" and forwards to the FILE* overload
+    int load_cache(const char* filename) const; // opens filename with "rb" and forwards to the FILE* overload
+#endif
+
 protected:
     int create_shader_module(int shader_type_index, const Option& opt,
                              uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 9d5b6517e..0c4849091 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -65,6 +65,7 @@ ncnn_add_test(paramdict)
 
 if(NCNN_VULKAN)
     ncnn_add_test(command)
+    ncnn_add_test(pipecache)
 endif()
 
 if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
diff --git a/tests/test_pipecache.cpp b/tests/test_pipecache.cpp
new file mode 100644
index 000000000..acbcabe53
--- /dev/null
+++ b/tests/test_pipecache.cpp
@@ -0,0 +1,405 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "datareader.h"
+#include "gpu.h"
+#include "mat.h"
+#include "net.h"
+#include "pipelinecache.h"
+#include "testutil.h"
+
+#include <iostream>
+#include <chrono>
+#include <vector>
+
+// An empty data reader: lets a model's structure load while every weight reads as zero.
+class DataReaderFromEmpty : public ncnn::DataReader
+{
+public:
+    // Text scanning is a no-op: both arguments are ignored and 0 is returned.
+    virtual int scan(const char* /*format*/, void* /*p*/) const
+    {
+        return 0;
+    }
+    // Zero-fills buf with size bytes and reports all of them as read.
+    virtual size_t read(void* buf, size_t size) const
+    {
+        memset(buf, 0, size);
+        return size;
+    }
+};
+
+// Network structure (ncnn param text) for MobileNetV3
+static const char* mobilenet_v3_param = R"delimiter(
+7767517
+145 163
+Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
+Convolution              313                      1 1 data 313 -23330=4,3,112,112,16 0=16 1=3 3=2 4=1 5=1 6=432
+Split                    splitncnn_0              1 2 313 313_splitncnn_0 313_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16
+HardSigmoid              319                      1 1 313_splitncnn_1 319 -23330=4,3,112,112,16
+BinaryOp                 320                      2 1 313_splitncnn_0 319 320 -23330=4,3,112,112,16 0=2
+Split                    splitncnn_1              1 2 320 320_splitncnn_0 320_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16
+ConvolutionDepthWise     321                      1 1 320_splitncnn_1 323 -23330=4,3,112,112,16 0=16 1=3 4=1 5=1 6=144 7=16 9=1
+Convolution              324                      1 1 323 324 -23330=4,3,112,112,16 0=16 1=1 5=1 6=256
+BinaryOp                 326                      2 1 320_splitncnn_0 324 326 -23330=4,3,112,112,16
+Convolution              327                      1 1 326 329 -23330=4,3,112,112,64 0=64 1=1 5=1 6=1024 9=1
+ConvolutionDepthWise     330                      1 1 329 332 -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1
+Convolution              333                      1 1 332 333 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1536
+Split                    splitncnn_2              1 2 333 333_splitncnn_0 333_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
+Convolution              335                      1 1 333_splitncnn_1 337 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
+ConvolutionDepthWise     338                      1 1 337 340 -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1
+Convolution              341                      1 1 340 341 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728
+BinaryOp                 343                      2 1 333_splitncnn_0 341 343 -23330=4,3,56,56,24
+Convolution              344                      1 1 343 346 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
+ConvolutionDepthWise     347                      1 1 346 347 -23330=4,3,28,28,72 0=72 1=5 3=2 4=2 5=1 6=1800 7=72
+Split                    splitncnn_3              1 2 347 347_splitncnn_0 347_splitncnn_1 -23330=8,3,28,28,72,3,28,28,72
+Pooling                  355                      1 1 347_splitncnn_1 359 -23330=4,1,72,1,1 0=1 4=1
+InnerProduct             360                      1 1 359 361 -23330=4,1,18,1,1 0=18 1=1 2=1296 9=1
+InnerProduct             362                      1 1 361 362 -23330=4,1,72,1,1 0=72 1=1 2=1296
+HardSigmoid              367                      1 1 362 367 -23330=4,1,72,1,1
+BinaryOp                 376                      2 1 347_splitncnn_0 367 376 -23330=4,3,28,28,72 0=2
+ReLU                     377                      1 1 376 377 -23330=4,3,28,28,72
+Convolution              378                      1 1 377 378 -23330=4,3,28,28,40 0=40 1=1 5=1 6=2880
+Split                    splitncnn_4              1 2 378 378_splitncnn_0 378_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
+Convolution              380                      1 1 378_splitncnn_1 382 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
+ConvolutionDepthWise     383                      1 1 382 383 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120
+Split                    splitncnn_5              1 2 383 383_splitncnn_0 383_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120
+Pooling                  391                      1 1 383_splitncnn_1 395 -23330=4,1,120,1,1 0=1 4=1
+InnerProduct             396                      1 1 395 397 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1
+InnerProduct             398                      1 1 397 398 -23330=4,1,120,1,1 0=120 1=1 2=3600
+HardSigmoid              403                      1 1 398 403 -23330=4,1,120,1,1
+BinaryOp                 412                      2 1 383_splitncnn_0 403 412 -23330=4,3,28,28,120 0=2
+ReLU                     413                      1 1 412 413 -23330=4,3,28,28,120
+Convolution              414                      1 1 413 414 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
+BinaryOp                 416                      2 1 378_splitncnn_0 414 416 -23330=4,3,28,28,40
+Split                    splitncnn_6              1 2 416 416_splitncnn_0 416_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
+Convolution              417                      1 1 416_splitncnn_1 419 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
+ConvolutionDepthWise     420                      1 1 419 420 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120
+Split                    splitncnn_7              1 2 420 420_splitncnn_0 420_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120
+Pooling                  428                      1 1 420_splitncnn_1 432 -23330=4,1,120,1,1 0=1 4=1
+InnerProduct             433                      1 1 432 434 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1
+InnerProduct             435                      1 1 434 435 -23330=4,1,120,1,1 0=120 1=1 2=3600
+HardSigmoid              440                      1 1 435 440 -23330=4,1,120,1,1
+BinaryOp                 449                      2 1 420_splitncnn_0 440 449 -23330=4,3,28,28,120 0=2
+ReLU                     450                      1 1 449 450 -23330=4,3,28,28,120
+Convolution              451                      1 1 450 451 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
+BinaryOp                 453                      2 1 416_splitncnn_0 451 453 -23330=4,3,28,28,40
+Convolution              454                      1 1 453 454 -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600
+HardSwish                461                      1 1 454 461 -23330=4,3,28,28,240
+ConvolutionDepthWise     462                      1 1 461 462 -23330=4,3,14,14,240 0=240 1=3 3=2 4=1 5=1 6=2160 7=240
+HardSwish                469                      1 1 462 469 -23330=4,3,14,14,240
+Convolution              470                      1 1 469 470 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
+Split                    splitncnn_8              1 2 470 470_splitncnn_0 470_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
+Convolution              472                      1 1 470_splitncnn_1 472 -23330=4,3,14,14,200 0=200 1=1 5=1 6=16000
+HardSwish                479                      1 1 472 479 -23330=4,3,14,14,200
+ConvolutionDepthWise     480                      1 1 479 480 -23330=4,3,14,14,200 0=200 1=3 4=1 5=1 6=1800 7=200
+HardSwish                487                      1 1 480 487 -23330=4,3,14,14,200
+Convolution              488                      1 1 487 488 -23330=4,3,14,14,80 0=80 1=1 5=1 6=16000
+BinaryOp                 490                      2 1 470_splitncnn_0 488 490 -23330=4,3,14,14,80
+Split                    splitncnn_9              1 2 490 490_splitncnn_0 490_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
+Convolution              491                      1 1 490_splitncnn_1 491 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720
+HardSwish                498                      1 1 491 498 -23330=4,3,14,14,184
+ConvolutionDepthWise     499                      1 1 498 499 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184
+HardSwish                506                      1 1 499 506 -23330=4,3,14,14,184
+Convolution              507                      1 1 506 507 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720
+BinaryOp                 509                      2 1 490_splitncnn_0 507 509 -23330=4,3,14,14,80
+Split                    splitncnn_10             1 2 509 509_splitncnn_0 509_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
+Convolution              510                      1 1 509_splitncnn_1 510 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720
+HardSwish                517                      1 1 510 517 -23330=4,3,14,14,184
+ConvolutionDepthWise     518                      1 1 517 518 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184
+HardSwish                525                      1 1 518 525 -23330=4,3,14,14,184
+Convolution              526                      1 1 525 526 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720
+BinaryOp                 528                      2 1 509_splitncnn_0 526 528 -23330=4,3,14,14,80
+Convolution              529                      1 1 528 529 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400
+HardSwish                536                      1 1 529 536 -23330=4,3,14,14,480
+ConvolutionDepthWise     537                      1 1 536 537 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480
+Split                    splitncnn_11             1 2 537 537_splitncnn_0 537_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
+Pooling                  545                      1 1 537_splitncnn_1 549 -23330=4,1,480,1,1 0=1 4=1
+InnerProduct             550                      1 1 549 551 -23330=4,1,120,1,1 0=120 1=1 2=57600 9=1
+InnerProduct             552                      1 1 551 552 -23330=4,1,480,1,1 0=480 1=1 2=57600
+HardSigmoid              557                      1 1 552 557 -23330=4,1,480,1,1
+BinaryOp                 566                      2 1 537_splitncnn_0 557 566 -23330=4,3,14,14,480 0=2
+HardSwish                572                      1 1 566 572 -23330=4,3,14,14,480
+Convolution              573                      1 1 572 573 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760
+Split                    splitncnn_12             1 2 573 573_splitncnn_0 573_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
+Convolution              575                      1 1 573_splitncnn_1 575 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
+HardSwish                582                      1 1 575 582 -23330=4,3,14,14,672
+ConvolutionDepthWise     583                      1 1 582 583 -23330=4,3,14,14,672 0=672 1=3 4=1 5=1 6=6048 7=672
+Split                    splitncnn_13             1 2 583 583_splitncnn_0 583_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
+Pooling                  591                      1 1 583_splitncnn_1 595 -23330=4,1,672,1,1 0=1 4=1
+InnerProduct             596                      1 1 595 597 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
+InnerProduct             598                      1 1 597 598 -23330=4,1,672,1,1 0=672 1=1 2=112896
+HardSigmoid              603                      1 1 598 603 -23330=4,1,672,1,1
+BinaryOp                 612                      2 1 583_splitncnn_0 603 612 -23330=4,3,14,14,672 0=2
+HardSwish                618                      1 1 612 618 -23330=4,3,14,14,672
+Convolution              619                      1 1 618 619 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264
+BinaryOp                 621                      2 1 573_splitncnn_0 619 621 -23330=4,3,14,14,112
+Convolution              622                      1 1 621 622 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
+HardSwish                629                      1 1 622 629 -23330=4,3,14,14,672
+ConvolutionDepthWise     630                      1 1 629 630 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672
+Split                    splitncnn_14             1 2 630 630_splitncnn_0 630_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
+Pooling                  638                      1 1 630_splitncnn_1 642 -23330=4,1,672,1,1 0=1 4=1
+InnerProduct             643                      1 1 642 644 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
+InnerProduct             645                      1 1 644 645 -23330=4,1,672,1,1 0=672 1=1 2=112896
+HardSigmoid              650                      1 1 645 650 -23330=4,1,672,1,1
+BinaryOp                 659                      2 1 630_splitncnn_0 650 659 -23330=4,3,14,14,672 0=2
+HardSwish                665                      1 1 659 665 -23330=4,3,14,14,672
+Convolution              666                      1 1 665 666 -23330=4,3,14,14,160 0=160 1=1 5=1 6=107520
+Convolution              668                      1 1 666 668 -23330=4,3,14,14,672 0=672 1=1 5=1 6=107520
+HardSwish                675                      1 1 668 675 -23330=4,3,14,14,672
+ConvolutionDepthWise     676                      1 1 675 676 -23330=4,3,7,7,672 0=672 1=5 3=2 4=2 5=1 6=16800 7=672
+Split                    splitncnn_15             1 2 676 676_splitncnn_0 676_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672
+Pooling                  684                      1 1 676_splitncnn_1 688 -23330=4,1,672,1,1 0=1 4=1
+InnerProduct             689                      1 1 688 690 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
+InnerProduct             691                      1 1 690 691 -23330=4,1,672,1,1 0=672 1=1 2=112896
+HardSigmoid              696                      1 1 691 696 -23330=4,1,672,1,1
+BinaryOp                 705                      2 1 676_splitncnn_0 696 705 -23330=4,3,7,7,672 0=2
+HardSwish                711                      1 1 705 711 -23330=4,3,7,7,672
+Convolution              712                      1 1 711 712 -23330=4,3,7,7,160 0=160 1=1 5=1 6=107520
+Split                    splitncnn_16             1 2 712 712_splitncnn_0 712_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160
+Convolution              714                      1 1 712_splitncnn_1 714 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600
+HardSwish                721                      1 1 714 721 -23330=4,3,7,7,960
+ConvolutionDepthWise     722                      1 1 721 722 -23330=4,3,7,7,960 0=960 1=5 4=2 5=1 6=24000 7=960
+Split                    splitncnn_17             1 2 722 722_splitncnn_0 722_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960
+Pooling                  730                      1 1 722_splitncnn_1 734 -23330=4,1,960,1,1 0=1 4=1
+InnerProduct             735                      1 1 734 736 -23330=4,1,240,1,1 0=240 1=1 2=230400 9=1
+InnerProduct             737                      1 1 736 737 -23330=4,1,960,1,1 0=960 1=1 2=230400
+HardSigmoid              742                      1 1 737 742 -23330=4,1,960,1,1
+BinaryOp                 751                      2 1 722_splitncnn_0 742 751 -23330=4,3,7,7,960 0=2
+HardSwish                757                      1 1 751 757 -23330=4,3,7,7,960
+Convolution              758                      1 1 757 758 -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600
+BinaryOp                 760                      2 1 712_splitncnn_0 758 760 -23330=4,3,7,7,160
+Convolution              761                      1 1 760 761 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600
+HardSwish                768                      1 1 761 768 -23330=4,3,7,7,960
+Pooling                  769                      1 1 768 769 -23330=4,1,960,1,1 0=1 4=1
+HardSwish                775                      1 1 769 775 -23330=4,1,960,1,1
+Reshape                  783                      1 1 775 783 -23330=4,1,960,1,1 0=-1
+InnerProduct             784                      1 1 783 784 -23330=4,1,1280,1,1 0=1280 1=1 2=1228800
+HardSwish                790                      1 1 784 790 -23330=4,1,1280,1,1
+InnerProduct             791                      1 1 790 791 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000
+Softmax                  prob                     1 1 791 output -23330=4,1,1000,1,1
+)delimiter";
+
+/**
+ * @brief Warm up with a tiny Sigmoid network and check that a pipeline cache can be saved, reloaded and reused
+ * @return 0 on success, -1 on failure
+ */
+static int warmup_gpu_pipecache()
+{
+    std::cout << "==================================================" << std::endl;
+    std::cout << "           Warmup: Testing Basic Cache IO         " << std::endl;
+    std::cout << "==================================================" << std::endl;
+
+    // 1. Build a network and run it once so that its pipelines get created
+    ncnn::Net net;
+    net.opt.use_vulkan_compute = true;
+
+    net.load_param_mem("7767517\n2 2\nInput    input0    0   1   input0\nSigmoid  sigmoid0  1   1   input0    output0");
+    net.load_model((unsigned char*)""); // used to create the pipelines
+
+    ncnn::Mat input0 = RandomMat(224, 224);
+    ncnn::Mat output0;
+    {
+        ncnn::Extractor ex = net.create_extractor();
+        ex.input("input0", input0);
+        ex.extract("output0", output0);
+    }
+
+    if (output0.empty())
+    {
+        std::cerr << "Warmup failed: initial extraction failed." << std::endl;
+        return -1;
+    }
+
+    // 2. Save the pipeline cache
+    const char* cache_path = "./sigmoid_pipecache.bin";
+    if (net.opt.pipeline_cache->save_cache(cache_path) != 0)
+    {
+        std::cerr << "Warmup failed: could not save pipeline cache to " << cache_path << std::endl;
+        return -1;
+    }
+    std::cout << "Warmup: Pipeline cache saved successfully." << std::endl;
+
+    // 3. Create a second network that loads the cache saved above
+    ncnn::PipelineCache cache2(net.vulkan_device()); // stack-owned (no leak); declared before net2 so it outlives it
+    ncnn::Net net2;
+    net2.opt.use_vulkan_compute = true;
+    net2.opt.pipeline_cache = &cache2;
+    net2.load_param_mem("7767517\n2 2\nInput    input0    0   1   input0\nSigmoid  sigmoid0  1   1   input0    output0");
+    if (net2.opt.pipeline_cache->load_cache(cache_path) != 0)
+    {
+        std::cerr << "Warmup failed: could not load pipeline cache from " << cache_path << std::endl;
+        return -1;
+    }
+    std::cout << "Warmup: Pipeline cache loaded successfully." << std::endl;
+    net2.load_model((unsigned char*)""); // creates the pipelines
+
+    // 4. Run inference again and verify that the result is unchanged
+    ncnn::Mat output0_2;
+    {
+        ncnn::Extractor ex2 = net2.create_extractor();
+        ex2.input("input0", input0);
+        ex2.extract("output0", output0_2);
+    }
+
+    if (output0_2.empty())
+    {
+        std::cerr << "Warmup failed: extraction after loading cache failed." << std::endl;
+        return -1;
+    }
+
+    if (CompareMat(output0, output0_2, 0.001) != 0)
+    {
+        std::cerr << "Warmup failed: output mismatch after loading cache." << std::endl;
+        return -1;
+    }
+
+    std::cout << "Warmup PASSED: Outputs are identical." << std::endl;
+    return 0;
+}
+
+/**
+ * @brief Compare model loading time with and without a pipeline cache and check that both runs give the same output
+ * @return 0 on success, -1 on failure
+ */
+static int test_gpu_pipecache_performance()
+{
+    ncnn::Mat output_no_cache;
+    double time_no_cache = 0;
+
+    const char* cache_path = "./mobilenet_pipecache.bin";
+    DataReaderFromEmpty dr;
+    ncnn::Mat input = RandomMat(224, 224, 3);
+
+    // -------------------------------------------------
+    // 1. Without a pipeline cache (first load)
+    // -------------------------------------------------
+    std::cout << "\n==================================================" << std::endl;
+    std::cout << "       Performance Test: Without Pipeline Cache   " << std::endl;
+    std::cout << "==================================================" << std::endl;
+    {
+        ncnn::Net net_no_cache;
+        net_no_cache.opt.use_vulkan_compute = true;
+
+        auto start = std::chrono::high_resolution_clock::now();
+
+        net_no_cache.load_param_mem(mobilenet_v3_param);
+        net_no_cache.load_model(dr);
+
+        auto end = std::chrono::high_resolution_clock::now();
+        time_no_cache = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start).count();
+        std::cout << "Model loading time without cache: " << time_no_cache << " ms" << std::endl;
+
+        // Run inference to obtain the baseline output
+        ncnn::Extractor ex = net_no_cache.create_extractor();
+        ex.input("data", input);
+        ex.extract("output", output_no_cache);
+
+        if (output_no_cache.empty())
+        {
+            std::cerr << "Test failed: extraction without cache failed." << std::endl;
+            return -1;
+        }
+
+        // Save the cache for use by the next step
+        if (net_no_cache.opt.pipeline_cache->save_cache(cache_path) != 0)
+        {
+            std::cerr << "Test failed: could not save pipeline cache to " << cache_path << std::endl;
+            return -1;
+        }
+        std::cout << "Pipeline cache generated and saved to " << cache_path << std::endl;
+    }
+
+    // -------------------------------------------------
+    // 2. With a pipeline cache (second load)
+    // -------------------------------------------------
+    ncnn::Mat output_with_cache;
+    double time_with_cache = 0;
+    std::cout << "\n==================================================" << std::endl;
+    std::cout << "        Performance Test: With Pipeline Cache     " << std::endl;
+    std::cout << "==================================================" << std::endl;
+    {
+        ncnn::PipelineCache loaded_cache(ncnn::get_gpu_device()); // stack-owned (no leak); declared first so it outlives the net
+        ncnn::Net net_with_cache;
+        // The cache must be attached before the model is loaded
+        net_with_cache.opt.pipeline_cache = &loaded_cache;
+        net_with_cache.opt.use_vulkan_compute = true;
+        auto start = std::chrono::high_resolution_clock::now();
+
+        // Load the cache from file
+        if (net_with_cache.opt.pipeline_cache->load_cache(cache_path) != 0)
+        {
+            std::cerr << "Test failed: could not load pipeline cache from " << cache_path << std::endl;
+            return -1;
+        }
+        net_with_cache.load_param_mem(mobilenet_v3_param);
+        net_with_cache.load_model(dr);
+
+        auto end = std::chrono::high_resolution_clock::now();
+        time_with_cache = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start).count();
+        std::cout << "Model loading time with cache: " << time_with_cache << " ms" << std::endl;
+
+        // Run inference
+        ncnn::Extractor ex2 = net_with_cache.create_extractor();
+        ex2.input("data", input);
+        ex2.extract("output", output_with_cache);
+
+        if (output_with_cache.empty())
+        {
+            std::cerr << "Test failed: extraction with cache failed." << std::endl;
+            return -1;
+        }
+    }
+
+    // -------------------------------------------------
+    // 3. Verification and summary
+    // -------------------------------------------------
+    std::cout << "\n==================================================" << std::endl;
+    std::cout << "              Verification and Summary            " << std::endl;
+    std::cout << "==================================================" << std::endl;
+
+    bool is_output_same = (CompareMat(output_no_cache, output_with_cache, 0.001) == 0);
+
+    std::cout << "Output verification: " << (is_output_same ? "SUCCESS" : "FAILURE") << std::endl;
+    std::cout << "--------------------------------------------------" << std::endl;
+    std::cout << "Performance Summary:" << std::endl;
+    std::cout << "  - Without Cache: " << time_no_cache << " ms" << std::endl;
+    std::cout << "  - With Cache:    " << time_with_cache << " ms" << std::endl;
+
+    if (time_no_cache > 0) {
+        double speedup = (time_no_cache - time_with_cache) / time_no_cache * 100; // percentage of loading time saved
+        std::cout << "  - Speedup:       " << speedup << "%" << std::endl;
+    }
+
+    if (!is_output_same)
+    {
+        std::cerr << "\nTest FAILED due to output mismatch." << std::endl;
+        return -1;
+    }
+
+    std::cout << "\nTest PASSED." << std::endl;
+    return 0;
+}
+
+int main()
+{
+    // Basic save/load round trip first; stop immediately if it fails.
+    const int warmup_status = warmup_gpu_pipecache();
+    if (warmup_status != 0)
+        return -1;
+
+    // The cached vs. uncached loading comparison then decides the exit status.
+    const int perf_status = test_gpu_pipecache_performance();
+    return perf_status;
+}