diff --git a/src/gpu.cpp b/src/gpu.cpp index 00a711d09..06fe089ac 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -98,6 +98,7 @@ struct layer_shader_registry_entry static const layer_shader_registry_entry layer_shader_registry[] = { #include "layer_shader_registry.h" + }; static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry); @@ -476,16 +477,16 @@ void GpuInfoPrivate::query_properties() } if (physicalDeviceProperties.vendorID == 0x13b5 - && (physicalDeviceProperties.deviceID == 0x7500001 - || physicalDeviceProperties.deviceID == 0x7501000 - || physicalDeviceProperties.deviceID == 0x8602000 - || physicalDeviceProperties.deviceID == 0x8800020 - || physicalDeviceProperties.deviceID == 0x70930000 - || physicalDeviceProperties.deviceID == 0x70901010 - || physicalDeviceProperties.deviceID == 0x72120000 - || physicalDeviceProperties.deviceID == 0x74021000 - || physicalDeviceProperties.deviceID == 0x60a00002 - || physicalDeviceProperties.deviceID == 0x62210001)) + && (physicalDeviceProperties.deviceID == 0x7500001 + || physicalDeviceProperties.deviceID == 0x7501000 + || physicalDeviceProperties.deviceID == 0x8602000 + || physicalDeviceProperties.deviceID == 0x8800020 + || physicalDeviceProperties.deviceID == 0x70930000 + || physicalDeviceProperties.deviceID == 0x70901010 + || physicalDeviceProperties.deviceID == 0x72120000 + || physicalDeviceProperties.deviceID == 0x74021000 + || physicalDeviceProperties.deviceID == 0x60a00002 + || physicalDeviceProperties.deviceID == 0x62210001)) { // NOTE rk3288/rk3399/t880/g31/g51/g52/g71/g72 // however, g76/g77 has explicit fp16 arithmetic @@ -494,9 +495,9 @@ void GpuInfoPrivate::query_properties() } if (physicalDeviceProperties.vendorID == 0x5143 - && (physicalDeviceProperties.deviceID == 0x6030001 - || physicalDeviceProperties.deviceID == 0x6040001 - || physicalDeviceProperties.deviceID == 0x6050002)) + && (physicalDeviceProperties.deviceID == 0x6030001 + || physicalDeviceProperties.deviceID == 0x6040001 + || physicalDeviceProperties.deviceID == 0x6050002)) { // TODO enable devices other than qcom845/qcom855/qcom855plus/qcom865 // qcom adreno driver accept spirv with fp16 arithmetic @@ -512,7 +513,7 @@ static uint32_t find_device_compute_queue(const std::vectorqueryCooperativeMatrixSubProperties[i]; if (cmp.AType == type && cmp.BType == type - && cmp.CType == acctype && cmp.ResultType == acctype - && cmp.scope == scope) + && cmp.CType == acctype && cmp.ResultType == acctype + && cmp.scope == scope) { mnk_properties.push_back(cmp); } @@ -2045,8 +2046,8 @@ void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponen const VkCooperativeMatrixPropertiesNV& cmp = d->queryCooperativeMatrixSubPropertiesNV[i]; if (cmp.AType == (VkComponentTypeNV)type && cmp.BType == (VkComponentTypeNV)type - && cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype - && cmp.scope == (VkScopeNV)scope) + && cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype + && cmp.scope == (VkScopeNV)scope) { VkCooperativeMatrixPropertiesKHR cmp_khr; cmp_khr.MSize = cmp.MSize; @@ -2459,7 +2460,7 @@ int create_gpu_instance(const char* driver_path) #endif // __ANDROID_API__ >= 26 uint32_t instance_api_version = VK_MAKE_VERSION(1, 0, 0); - typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion); + typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t* pApiVersion); PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion"); if (vkEnumerateInstanceVersion) { @@ -2672,7 +2673,7 @@ int create_gpu_instance(const char* driver_path) fp16_matrix_properties.push_back(cmp); } if ((cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.AType == VK_COMPONENT_TYPE_SINT8_PACKED_NV) - && (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV)) + && (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV)) { bool mnk_hit = false; for (size_t k = 0; k < int8_matrix_properties.size(); k++) @@ -2703,9 +2704,9 @@ int create_gpu_instance(const char* driver_path) bf16_matrix_properties.push_back(cmp); } if ((cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT - || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV) - && (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT - || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)) + || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV) + && (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT + || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)) { bool mnk_hit = false; for (size_t k = 0; k < fp8_matrix_properties.size(); k++) @@ -3143,8 +3144,9 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_ uop->vkdev = vkdev; ncnn::ParamDict pd; - pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack - pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int8 + pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 + : 8); // out_elempack + pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int8 pd.set(3, cast_type_to_index + 1); uop->load_param(pd); @@ -3734,7 +3736,7 @@ int VulkanDevice::create_pipeline_layout(int push_constant_count, VkDescriptorSe return 0; } -int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const +int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector& specializations, uint32_t subgroup_size, VkPipeline* pipeline, VkPipelineCache pipeline_cache) const { const int specialization_count = specializations.size(); @@ -3792,7 +3794,7 @@ int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout computePipelineCreateInfo.basePipelineHandle = 0; computePipelineCreateInfo.basePipelineIndex = 0; - VkResult ret = vkCreateComputePipelines(d->device, 0, 1, &computePipelineCreateInfo, 0, pipeline); + VkResult ret = vkCreateComputePipelines(d->device, pipeline_cache, 1, &computePipelineCreateInfo, 0, pipeline); if (ret != VK_SUCCESS) { NCNN_LOGE("vkCreateComputePipelines failed %d", ret); @@ -3871,6 +3873,18 @@ int VulkanDevice::create_descriptor_update_template(int binding_count, const int return 0; } +int VulkanDevice::create_pipeline_cache(const VkPipelineCacheCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineCache* pPipelineCache) const +{ + VkResult ret = vkCreatePipelineCache(d->device, pCreateInfo, pAllocator, pPipelineCache); + if (ret != VK_SUCCESS) + { + NCNN_LOGE("vkCreatePipelineCache failed %d", ret); + return -1; + } + + return 0; +} + uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const { const VkPhysicalDeviceMemoryProperties& memory_properties = info.physicalDeviceMemoryProperties(); @@ -3883,8 +3897,8 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ { const VkMemoryType& memoryType = memory_properties.memoryTypes[i]; if ((memoryType.propertyFlags & required) == required - && (preferred && (memoryType.propertyFlags & preferred)) - && (preferred_not && !(memoryType.propertyFlags & preferred_not))) + && (preferred && (memoryType.propertyFlags & preferred)) + && (preferred_not && !(memoryType.propertyFlags & preferred_not))) { return i; } @@ -3899,7 +3913,7 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ { const VkMemoryType& memoryType = memory_properties.memoryTypes[i]; if ((memoryType.propertyFlags & required) == required - && (preferred && (memoryType.propertyFlags & preferred))) + && (preferred && (memoryType.propertyFlags & preferred))) { return i; } @@ -3914,7 +3928,7 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ { const VkMemoryType& memoryType = memory_properties.memoryTypes[i]; if ((memoryType.propertyFlags & required) == required - && (preferred_not && !(memoryType.propertyFlags & preferred_not))) + && (preferred_not && !(memoryType.propertyFlags & preferred_not))) { return i; } @@ -4222,7 +4236,8 @@ void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempac void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const { - int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2; + int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 + : 2; int cast_type_from_index; if (src.elembits() == 32) diff --git a/src/gpu.h b/src/gpu.h index 7863b2e21..c3256339b 100644 --- a/src/gpu.h +++ b/src/gpu.h @@ -419,8 +419,9 @@ public: // helper for creating pipeline int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const; int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const; - int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const; + int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector& specializations, uint32_t subgroup_size, VkPipeline* pipeline, VkPipelineCache pipeline_cache = 0) const; int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const; + int create_pipeline_cache(const VkPipelineCacheCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineCache* pPipelineCache) const; uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const; bool is_mappable(uint32_t memory_type_index) const; diff --git a/src/pipelinecache.cpp b/src/pipelinecache.cpp index 1bd274514..cde9ee9ca 100644 --- a/src/pipelinecache.cpp +++ b/src/pipelinecache.cpp @@ -110,13 +110,42 @@ public: ShaderInfo shader_info; // TODO use pointer ? }; + struct spv_param + { + union + { + struct + { + int32_t shader_type_index; + uint32_t opt_bits; + }; + uint64_t d0; + }; + }; + + struct pipeline_cache_header + { + uint32_t magic = 0x5a545546; + uint32_t vendorID; // VkPhysicalDeviceProperties::vendorID + uint32_t deviceID; // VkPhysicalDeviceProperties::deviceID + uint32_t driverVersion; // VkPhysicalDeviceProperties::driverVersion + uint8_t uuid[VK_UUID_SIZE]; // VkPhysicalDeviceProperties::pipelineCacheUUID + + uint32_t spv_size; // size of spirv data + uint32_t pipeline_cache_size; + }; + mutable std::vector cache_digests; mutable std::vector cache_artifacts; + + VkPipelineCache vk_pipeline_cache; + mutable std::vector > > cache_spirv_module; // digest(index,opt) -> spirv data + mutable Mutex cache_lock; }; PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_t* spv_data, size_t spv_data_size, const std::vector& specializations, - uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size) + uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size) { spv_data_murmur3 = murmur3_32(spv_data, spv_data_size / 4); @@ -134,7 +163,7 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_ } PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_type_index, const Option& opt, const std::vector& specializations, - uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size) + uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size) { shader_type_index = _shader_type_index; @@ -160,6 +189,18 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_t PipelineCache::PipelineCache(const VulkanDevice* _vkdev) : vkdev(_vkdev), d(new PipelineCachePrivate) { + VkPipelineCacheCreateInfo pipelineCacheCreateInfo{}; + pipelineCacheCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; + pipelineCacheCreateInfo.initialDataSize = 0; // zeros for empty cache + pipelineCacheCreateInfo.pInitialData = nullptr; + + int ret = 0; + ret = _vkdev->create_pipeline_cache(&pipelineCacheCreateInfo, 0, &d->vk_pipeline_cache); + if (ret != 0) + { + NCNN_LOGE("create_pipeline_cache failed %d", ret); + d->vk_pipeline_cache = 0; + } } PipelineCache::~PipelineCache() @@ -381,18 +422,288 @@ int PipelineCache::get_pipeline(int shader_type_index, const Option& opt, const return 0; } +int PipelineCache::save_cache(std::vector& buf) const +{ + if (!vkdev) + { + NCNN_LOGE("vkdev is null"); + return -1; + } + MutexLockGuard lock(d->cache_lock); + + PipelineCachePrivate::pipeline_cache_header header; + + // Platform information + header.vendorID = vkdev->info.vendor_id(); + header.deviceID = vkdev->info.device_id(); + header.driverVersion = vkdev->info.driver_version(); + memcpy(header.uuid, vkdev->info.pipeline_cache_uuid(), VK_UUID_SIZE); + + header.spv_size = d->cache_spirv_module.size(); + + size_t buf_size = 0; + if (vkGetPipelineCacheData(vkdev->vkdevice(), d->vk_pipeline_cache, &buf_size, nullptr) != VK_SUCCESS) + { + NCNN_LOGE("vkGetPipelineCacheData failed"); + return -1; + } + header.pipeline_cache_size = (uint32_t)buf_size; + + std::vector pipe_data(header.pipeline_cache_size); + if (vkGetPipelineCacheData(vkdev->vkdevice(), d->vk_pipeline_cache, &buf_size, pipe_data.data()) != VK_SUCCESS) + { + NCNN_LOGE("vkGetPipelineCacheData failed"); + return -1; + } + + buf.resize(sizeof(header)); + memcpy(buf.data(), &header, sizeof(header)); + + // spv_digest and spv_data + for (size_t i = 0; i < d->cache_spirv_module.size(); i++) + { + const PipelineCachePrivate::spv_param& sd = d->cache_spirv_module[i].first; + const std::vector& spv_data = d->cache_spirv_module[i].second; + uint32_t size = (uint32_t)spv_data.size(); + + size_t current_buf_size = buf.size(); + buf.resize(current_buf_size + sizeof(sd) + sizeof(size) + spv_data.size() * sizeof(uint32_t)); + + memcpy(buf.data() + current_buf_size, &sd, sizeof(sd)); + current_buf_size += sizeof(sd); + memcpy(buf.data() + current_buf_size, &size, sizeof(size)); + current_buf_size += sizeof(size); + + memcpy(buf.data() + current_buf_size, spv_data.data(), spv_data.size() * sizeof(uint32_t)); + } + + buf.insert(buf.end(), pipe_data.begin(), pipe_data.end()); + return 0; +} + +int PipelineCache::load_cache(const std::vector& buf) const +{ + if (!vkdev) + { + NCNN_LOGE("vkdev is null"); + return -1; + } + MutexLockGuard lock(d->cache_lock); + + // Corrected struct name to pipeline_cache_header (lowercase h) + if (buf.size() < sizeof(PipelineCachePrivate::pipeline_cache_header)) + { + NCNN_LOGE("Invalid cache buffer size: too small for header"); + return -1; + } + + PipelineCachePrivate::pipeline_cache_header header; + memcpy(&header, buf.data(), sizeof(header)); + + // Validate magic number + if (header.magic != 0x5a545546) + { + NCNN_LOGE("Invalid cache magic number"); + return -1; + } + + // Validate platform information for compatibility + if (header.vendorID != vkdev->info.vendor_id() || header.deviceID != vkdev->info.device_id() || header.driverVersion != vkdev->info.driver_version() || memcmp(header.uuid, vkdev->info.pipeline_cache_uuid(), VK_UUID_SIZE) != 0) + { + NCNN_LOGE("Cache platform mismatch, might be incompatible."); + return -1; + } + + size_t current_offset = sizeof(header); + + // Load SPIR-V data and associated spv_param + d->cache_spirv_module.reserve(header.spv_size); + + for (uint32_t i = 0; i < header.spv_size; ++i) + { + if (current_offset + sizeof(PipelineCachePrivate::spv_param) + sizeof(uint32_t) > buf.size()) + { + NCNN_LOGE("Invalid cache buffer size: incomplete spv_param or size for entry %u", i); + return -1; + } + + PipelineCachePrivate::spv_param sd; + memcpy(&sd, buf.data() + current_offset, sizeof(sd)); + current_offset += sizeof(sd); + + uint32_t spv_vec_size_uint32; // Size in uint32_t units + memcpy(&spv_vec_size_uint32, buf.data() + current_offset, sizeof(spv_vec_size_uint32)); + current_offset += sizeof(spv_vec_size_uint32); + + size_t spv_data_byte_size = spv_vec_size_uint32 * sizeof(uint32_t); + + if (current_offset + spv_data_byte_size > buf.size()) + { + NCNN_LOGE("Invalid cache buffer size: incomplete spv_data for entry %u", i); + return -1; + } + + std::vector spirv_data(spv_vec_size_uint32); + memcpy(spirv_data.data(), buf.data() + current_offset, spv_data_byte_size); + current_offset += spv_data_byte_size; + + d->cache_spirv_module.push_back({sd, spirv_data}); + } + + // Load Vulkan Pipeline Cache Data + if (current_offset + header.pipeline_cache_size > buf.size()) + { + NCNN_LOGE("Invalid cache buffer size: incomplete pipeline cache data"); + return -1; + } + + if (d->vk_pipeline_cache) + { + vkDestroyPipelineCache(vkdev->vkdevice(), d->vk_pipeline_cache, 0); + d->vk_pipeline_cache = 0; + } + + VkPipelineCacheCreateInfo pipelineCacheCreateInfo{}; + pipelineCacheCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; + pipelineCacheCreateInfo.initialDataSize = header.pipeline_cache_size; + pipelineCacheCreateInfo.pInitialData = buf.data() + current_offset; + + int ret = vkdev->create_pipeline_cache(&pipelineCacheCreateInfo, 0, &d->vk_pipeline_cache); + if (ret != 0) + { + NCNN_LOGE("create_pipeline_cache with initial data failed %d", ret); + d->vk_pipeline_cache = 0; + return -1; + } + + return 0; +} + +int PipelineCache::save_cache(FILE* fp) const +{ + if (!fp) + { + NCNN_LOGE("Invalid FILE pointer for saving cache."); + return -1; + } + + std::vector buf; + int ret = save_cache(buf); + if (ret != 0) + { + NCNN_LOGE("Failed to get cache data into buffer for saving to file."); + return ret; + } + + if (fwrite(buf.data(), 1, buf.size(), fp) != buf.size()) + { + NCNN_LOGE("Failed to write cache data to file."); + return -1; + } + + return 0; +} + +int PipelineCache::load_cache(FILE* fp) const +{ + if (!fp) + { + NCNN_LOGE("Invalid FILE pointer for loading cache."); + return -1; + } + + fseek(fp, 0, SEEK_END); + long file_size = ftell(fp); + fseek(fp, 0, SEEK_SET); + + if (file_size < 0) + { + NCNN_LOGE("Failed to determine file size for loading cache."); + return -1; + } + + std::vector buf(file_size); + if (fread(buf.data(), 1, file_size, fp) != (size_t)file_size) + { + NCNN_LOGE("Failed to read cache data from file."); + return -1; + } + + return load_cache(buf); +} + +int PipelineCache::save_cache(const char* filename) const +{ + if (!filename) + { + NCNN_LOGE("Invalid filename for saving cache."); + return -1; + } + + FILE* fp = fopen(filename, "wb"); + if (!fp) + { + NCNN_LOGE("Failed to open file %s for writing cache.", filename); + return -1; + } + + int ret = save_cache(fp); + fclose(fp); + + return ret; +} + +int PipelineCache::load_cache(const char* filename) const +{ + if (!filename) + { + NCNN_LOGE("Invalid filename for loading cache."); + return -1; + } + + FILE* fp = fopen(filename, "rb"); + if (!fp) + { + NCNN_LOGE("Failed to open file %s for reading cache.", filename); + return -1; + } + + int ret = load_cache(fp); + fclose(fp); + + return ret; +} + int PipelineCache::create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, VkShaderModule* _shader_module, ShaderInfo& si) const { + uint32_t opt_bits = 0 << 7 + | opt.use_fp16_packed << 6 + | opt.use_fp16_storage << 5 + | opt.use_fp16_arithmetic << 4 + | opt.use_int8_storage << 3 + | opt.use_int8_arithmetic << 2; + std::vector spirv; + + for (int i = 0; i < d->cache_spirv_module.size(); i++) + { + if (d->cache_spirv_module[i].first.d0 == PipelineCachePrivate::spv_param({shader_type_index, opt_bits}).d0) // hit cache + { + spirv = d->cache_spirv_module[i].second; + goto hit_cache; + } + } + int retc = compile_spirv_module(shader_type_index, opt, spirv); if (retc != 0) { NCNN_LOGE("compile_spirv_module failed %d", retc); return -1; } - + d->cache_spirv_module.push_back({{shader_type_index, opt_bits}, spirv}); +hit_cache: const uint32_t* spv_data = spirv.data(); size_t spv_data_size = spirv.size() * 4; @@ -445,7 +756,7 @@ int PipelineCache::new_pipeline(VkShaderModule shader_module, const ShaderInfo& if (ret != 0) goto ERROR_PipelineCache; - ret = vkdev->create_pipeline(shader_module, pipeline_layout, specializations, subgroup_size, &pipeline); + ret = vkdev->create_pipeline(shader_module, pipeline_layout, specializations, subgroup_size, &pipeline, d->vk_pipeline_cache); if (ret != 0) goto ERROR_PipelineCache; diff --git a/src/pipelinecache.h b/src/pipelinecache.h index b93c0cfd8..15086e268 100644 --- a/src/pipelinecache.h +++ b/src/pipelinecache.h @@ -42,6 +42,16 @@ public: VkDescriptorUpdateTemplateKHR* descriptor_update_template, ShaderInfo& shader_info) const; + int save_cache(std::vector &buf) const; + int load_cache(const std::vector& buf) const; + +#ifdef NCNN_STDIO + int save_cache(FILE* fp) const; + int load_cache(FILE* fp) const; + int save_cache(const char* fp) const; + int load_cache(const char* fp) const; +#endif + protected: int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9d5b6517e..0c4849091 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -65,6 +65,7 @@ ncnn_add_test(paramdict) if(NCNN_VULKAN) ncnn_add_test(command) + ncnn_add_test(pipecache) endif() if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") diff --git a/tests/test_pipecache.cpp b/tests/test_pipecache.cpp new file mode 100644 index 000000000..acbcabe53 --- /dev/null +++ b/tests/test_pipecache.cpp @@ -0,0 +1,405 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "datareader.h" +#include "gpu.h" +#include "mat.h" +#include "net.h" +#include "pipelinecache.h" +#include "testutil.h" + +#include +#include +#include + +// 一个空数据读取器,用于加载模型结构,权重将全部为0 +class DataReaderFromEmpty : public ncnn::DataReader +{ +public: + virtual int scan(const char* format, void* p) const + { + (void)format; // unused + (void)p; // unused + return 0; + } + virtual size_t read(void* buf, size_t size) const + { + memset(buf, 0, size); + return size; + } +}; + +// MobileNetV3 的网络结构参数 +static const char* mobilenet_v3_param = R"delimiter( +7767517 +145 163 +Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 +Convolution 313 1 1 data 313 -23330=4,3,112,112,16 0=16 1=3 3=2 4=1 5=1 6=432 +Split splitncnn_0 1 2 313 313_splitncnn_0 313_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16 +HardSigmoid 319 1 1 313_splitncnn_1 319 -23330=4,3,112,112,16 +BinaryOp 320 2 1 313_splitncnn_0 319 320 -23330=4,3,112,112,16 0=2 +Split splitncnn_1 1 2 320 320_splitncnn_0 320_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16 +ConvolutionDepthWise 321 1 1 320_splitncnn_1 323 -23330=4,3,112,112,16 0=16 1=3 4=1 5=1 6=144 7=16 9=1 +Convolution 324 1 1 323 324 -23330=4,3,112,112,16 0=16 1=1 5=1 6=256 +BinaryOp 326 2 1 320_splitncnn_0 324 326 -23330=4,3,112,112,16 +Convolution 327 1 1 326 329 -23330=4,3,112,112,64 0=64 1=1 5=1 6=1024 9=1 +ConvolutionDepthWise 330 1 1 329 332 -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1 +Convolution 333 1 1 332 333 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1536 +Split splitncnn_2 1 2 333 333_splitncnn_0 333_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24 +Convolution 335 1 1 333_splitncnn_1 337 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1 +ConvolutionDepthWise 338 1 1 337 340 -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1 +Convolution 341 1 1 340 341 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728 +BinaryOp 343 2 1 333_splitncnn_0 341 343 -23330=4,3,56,56,24 +Convolution 344 1 1 343 346 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1 +ConvolutionDepthWise 347 1 1 346 347 -23330=4,3,28,28,72 0=72 1=5 3=2 4=2 5=1 6=1800 7=72 +Split splitncnn_3 1 2 347 347_splitncnn_0 347_splitncnn_1 -23330=8,3,28,28,72,3,28,28,72 +Pooling 355 1 1 347_splitncnn_1 359 -23330=4,1,72,1,1 0=1 4=1 +InnerProduct 360 1 1 359 361 -23330=4,1,18,1,1 0=18 1=1 2=1296 9=1 +InnerProduct 362 1 1 361 362 -23330=4,1,72,1,1 0=72 1=1 2=1296 +HardSigmoid 367 1 1 362 367 -23330=4,1,72,1,1 +BinaryOp 376 2 1 347_splitncnn_0 367 376 -23330=4,3,28,28,72 0=2 +ReLU 377 1 1 376 377 -23330=4,3,28,28,72 +Convolution 378 1 1 377 378 -23330=4,3,28,28,40 0=40 1=1 5=1 6=2880 +Split splitncnn_4 1 2 378 378_splitncnn_0 378_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 +Convolution 380 1 1 378_splitncnn_1 382 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1 +ConvolutionDepthWise 383 1 1 382 383 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 +Split splitncnn_5 1 2 383 383_splitncnn_0 383_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120 +Pooling 391 1 1 383_splitncnn_1 395 -23330=4,1,120,1,1 0=1 4=1 +InnerProduct 396 1 1 395 397 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1 +InnerProduct 398 1 1 397 398 -23330=4,1,120,1,1 0=120 1=1 2=3600 +HardSigmoid 403 1 1 398 403 -23330=4,1,120,1,1 +BinaryOp 412 2 1 383_splitncnn_0 403 412 -23330=4,3,28,28,120 0=2 +ReLU 413 1 1 412 413 -23330=4,3,28,28,120 +Convolution 414 1 1 413 414 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800 +BinaryOp 416 2 1 378_splitncnn_0 414 416 -23330=4,3,28,28,40 +Split splitncnn_6 1 2 416 416_splitncnn_0 416_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 +Convolution 417 1 1 416_splitncnn_1 419 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1 +ConvolutionDepthWise 420 1 1 419 420 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 +Split splitncnn_7 1 2 420 420_splitncnn_0 420_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120 +Pooling 428 1 1 420_splitncnn_1 432 -23330=4,1,120,1,1 0=1 4=1 +InnerProduct 433 1 1 432 434 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1 +InnerProduct 435 1 1 434 435 -23330=4,1,120,1,1 0=120 1=1 2=3600 +HardSigmoid 440 1 1 435 440 -23330=4,1,120,1,1 +BinaryOp 449 2 1 420_splitncnn_0 440 449 -23330=4,3,28,28,120 0=2 +ReLU 450 1 1 449 450 -23330=4,3,28,28,120 +Convolution 451 1 1 450 451 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800 +BinaryOp 453 2 1 416_splitncnn_0 451 453 -23330=4,3,28,28,40 +Convolution 454 1 1 453 454 -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600 +HardSwish 461 1 1 454 461 -23330=4,3,28,28,240 +ConvolutionDepthWise 462 1 1 461 462 -23330=4,3,14,14,240 0=240 1=3 3=2 4=1 5=1 6=2160 7=240 +HardSwish 469 1 1 462 469 -23330=4,3,14,14,240 +Convolution 470 1 1 469 470 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200 +Split splitncnn_8 1 2 470 470_splitncnn_0 470_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 +Convolution 472 1 1 470_splitncnn_1 472 -23330=4,3,14,14,200 0=200 1=1 5=1 6=16000 +HardSwish 479 1 1 472 479 -23330=4,3,14,14,200 +ConvolutionDepthWise 480 1 1 479 480 -23330=4,3,14,14,200 0=200 1=3 4=1 5=1 6=1800 7=200 +HardSwish 487 1 1 480 487 -23330=4,3,14,14,200 +Convolution 488 1 1 487 488 -23330=4,3,14,14,80 0=80 1=1 5=1 6=16000 +BinaryOp 490 2 1 470_splitncnn_0 488 490 -23330=4,3,14,14,80 +Split splitncnn_9 1 2 490 490_splitncnn_0 490_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 +Convolution 491 1 1 490_splitncnn_1 491 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720 +HardSwish 498 1 1 491 498 -23330=4,3,14,14,184 +ConvolutionDepthWise 499 1 1 498 499 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184 +HardSwish 506 1 1 499 506 -23330=4,3,14,14,184 +Convolution 507 1 1 506 507 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720 +BinaryOp 509 2 1 490_splitncnn_0 507 509 -23330=4,3,14,14,80 +Split splitncnn_10 1 2 509 509_splitncnn_0 509_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 +Convolution 510 1 1 509_splitncnn_1 510 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720 +HardSwish 517 1 1 510 517 -23330=4,3,14,14,184 +ConvolutionDepthWise 518 1 1 517 518 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184 +HardSwish 525 1 1 518 525 -23330=4,3,14,14,184 +Convolution 526 1 1 525 526 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720 +BinaryOp 528 2 1 509_splitncnn_0 526 528 -23330=4,3,14,14,80 +Convolution 529 1 1 528 529 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 +HardSwish 536 1 1 529 536 -23330=4,3,14,14,480 +ConvolutionDepthWise 537 1 1 536 537 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480 +Split splitncnn_11 1 2 537 537_splitncnn_0 537_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 +Pooling 545 1 1 537_splitncnn_1 549 -23330=4,1,480,1,1 0=1 4=1 +InnerProduct 550 1 1 549 551 -23330=4,1,120,1,1 0=120 1=1 2=57600 9=1 +InnerProduct 552 1 1 551 552 -23330=4,1,480,1,1 0=480 1=1 2=57600 +HardSigmoid 557 1 1 552 557 -23330=4,1,480,1,1 +BinaryOp 566 2 1 537_splitncnn_0 557 566 -23330=4,3,14,14,480 0=2 +HardSwish 572 1 1 566 572 -23330=4,3,14,14,480 +Convolution 573 1 1 572 573 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760 +Split splitncnn_12 1 2 573 573_splitncnn_0 573_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112 +Convolution 575 1 1 573_splitncnn_1 575 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264 +HardSwish 582 1 1 575 582 -23330=4,3,14,14,672 +ConvolutionDepthWise 583 1 1 582 583 -23330=4,3,14,14,672 0=672 1=3 4=1 5=1 6=6048 7=672 +Split splitncnn_13 1 2 583 583_splitncnn_0 583_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672 +Pooling 591 1 1 583_splitncnn_1 595 -23330=4,1,672,1,1 0=1 4=1 +InnerProduct 596 1 1 595 597 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1 +InnerProduct 598 1 1 597 598 -23330=4,1,672,1,1 0=672 1=1 2=112896 +HardSigmoid 603 1 1 598 603 -23330=4,1,672,1,1 +BinaryOp 612 2 1 583_splitncnn_0 603 612 -23330=4,3,14,14,672 0=2 +HardSwish 618 1 1 612 618 -23330=4,3,14,14,672 +Convolution 619 1 1 618 619 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264 +BinaryOp 621 2 1 573_splitncnn_0 619 621 -23330=4,3,14,14,112 +Convolution 622 1 1 621 622 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264 +HardSwish 629 1 1 622 629 -23330=4,3,14,14,672 +ConvolutionDepthWise 630 1 1 629 630 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672 +Split splitncnn_14 1 2 630 630_splitncnn_0 630_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672 +Pooling 638 1 1 630_splitncnn_1 642 -23330=4,1,672,1,1 0=1 4=1 +InnerProduct 643 1 1 642 644 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1 +InnerProduct 645 1 1 644 645 -23330=4,1,672,1,1 0=672 1=1 2=112896 +HardSigmoid 650 1 1 645 650 -23330=4,1,672,1,1 +BinaryOp 659 2 1 630_splitncnn_0 650 659 -23330=4,3,14,14,672 0=2 +HardSwish 665 1 1 659 665 -23330=4,3,14,14,672 +Convolution 666 1 1 665 666 -23330=4,3,14,14,160 0=160 1=1 5=1 6=107520 +Convolution 668 1 1 666 668 -23330=4,3,14,14,672 0=672 1=1 5=1 6=107520 +HardSwish 675 1 1 668 675 -23330=4,3,14,14,672 +ConvolutionDepthWise 676 1 1 675 676 -23330=4,3,7,7,672 0=672 1=5 3=2 4=2 5=1 6=16800 7=672 +Split splitncnn_15 1 2 676 676_splitncnn_0 676_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672 +Pooling 684 1 1 676_splitncnn_1 688 -23330=4,1,672,1,1 0=1 4=1 +InnerProduct 689 1 1 688 690 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1 +InnerProduct 691 1 1 690 691 -23330=4,1,672,1,1 0=672 1=1 2=112896 +HardSigmoid 696 1 1 691 696 -23330=4,1,672,1,1 +BinaryOp 705 2 1 676_splitncnn_0 696 705 -23330=4,3,7,7,672 0=2 +HardSwish 711 1 1 705 711 -23330=4,3,7,7,672 +Convolution 712 1 1 711 712 -23330=4,3,7,7,160 0=160 1=1 5=1 6=107520 +Split splitncnn_16 1 2 712 712_splitncnn_0 712_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160 +Convolution 714 1 1 712_splitncnn_1 714 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600 +HardSwish 721 1 1 714 721 -23330=4,3,7,7,960 +ConvolutionDepthWise 722 1 1 721 722 -23330=4,3,7,7,960 0=960 1=5 4=2 5=1 6=24000 7=960 +Split splitncnn_17 1 2 722 722_splitncnn_0 722_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960 +Pooling 730 1 1 722_splitncnn_1 734 -23330=4,1,960,1,1 0=1 4=1 +InnerProduct 735 1 1 734 736 -23330=4,1,240,1,1 0=240 1=1 2=230400 9=1 +InnerProduct 737 1 1 736 737 -23330=4,1,960,1,1 0=960 1=1 2=230400 +HardSigmoid 742 1 1 737 742 -23330=4,1,960,1,1 +BinaryOp 751 2 1 722_splitncnn_0 742 751 -23330=4,3,7,7,960 0=2 +HardSwish 757 1 1 751 757 -23330=4,3,7,7,960 +Convolution 758 1 1 757 758 -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600 +BinaryOp 760 2 1 712_splitncnn_0 758 760 -23330=4,3,7,7,160 +Convolution 761 1 1 760 761 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600 +HardSwish 768 1 1 761 768 -23330=4,3,7,7,960 +Pooling 769 1 1 768 769 -23330=4,1,960,1,1 0=1 4=1 +HardSwish 775 1 1 769 775 -23330=4,1,960,1,1 +Reshape 783 1 1 775 783 -23330=4,1,960,1,1 0=-1 +InnerProduct 784 1 1 783 784 -23330=4,1,1280,1,1 0=1280 1=1 2=1228800 +HardSwish 790 1 1 784 790 -23330=4,1,1280,1,1 +InnerProduct 791 1 1 790 791 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000 +Softmax prob 1 1 791 output -23330=4,1,1000,1,1 +)delimiter"; + +/** + * @brief 使用一个简单的 Sigmoid 网络预热并测试 Pipeline Cache 的基本保存和加载功能 + * @return 0 on success, -1 on failure + */ +static int warmup_gpu_pipecache() +{ + std::cout << "==================================================" << std::endl; + std::cout << " Warmup: Testing Basic Cache IO " << std::endl; + std::cout << "==================================================" << std::endl; + + // 1. 创建一个网络,运行一次以生成 pipeline + ncnn::Net net; + net.opt.use_vulkan_compute = true; + + net.load_param_mem("7767517\n2 2\nInput input0 0 1 input0\nSigmoid sigmoid0 1 1 input0 output0"); + net.load_model((unsigned char*)""); // 用于创建 pipeline + + ncnn::Mat input0 = RandomMat(224, 224); + ncnn::Mat output0; + { + ncnn::Extractor ex = net.create_extractor(); + ex.input("input0", input0); + ex.extract("output0", output0); + } + + if (output0.empty()) + { + std::cerr << "Warmup failed: initial extraction failed." << std::endl; + return -1; + } + + // 2. 保存 pipeline cache + const char* cache_path = "./sigmoid_pipecache.bin"; + if (net.opt.pipeline_cache->save_cache(cache_path) != 0) + { + std::cerr << "Warmup failed: could not save pipeline cache to " << cache_path << std::endl; + return -1; + } + std::cout << "Warmup: Pipeline cache saved successfully." << std::endl; + + // 3. 创建第二个网络,加载刚才保存的 cache + ncnn::Net net2; + net2.opt.use_vulkan_compute = true; + net2.opt.pipeline_cache = new ncnn::PipelineCache(net.vulkan_device()); + + net2.load_param_mem("7767517\n2 2\nInput input0 0 1 input0\nSigmoid sigmoid0 1 1 input0 output0"); + if (net2.opt.pipeline_cache->load_cache(cache_path) != 0) + { + std::cerr << "Warmup failed: could not load pipeline cache from " << cache_path << std::endl; + return -1; + } + std::cout << "Warmup: Pipeline cache loaded successfully." << std::endl; + net2.load_model((unsigned char*)""); // 创建 pipeline + + // 4. 再次推理并验证结果是否一致 + ncnn::Mat output0_2; + { + ncnn::Extractor ex2 = net2.create_extractor(); + ex2.input("input0", input0); + ex2.extract("output0", output0_2); + } + + if (output0_2.empty()) + { + std::cerr << "Warmup failed: extraction after loading cache failed." << std::endl; + return -1; + } + + if (CompareMat(output0, output0_2, 0.001) != 0) + { + std::cerr << "Warmup failed: output mismatch after loading cache." << std::endl; + return -1; + } + + std::cout << "Warmup PASSED: Outputs are identical." << std::endl; + return 0; +} + +/** + * @brief 对比使用和不使用 Pipeline Cache 时的模型加载性能 + * @return 0 on success, -1 on failure + */ +static int test_gpu_pipecache_performance() +{ + ncnn::Mat output_no_cache; + double time_no_cache = 0; + + const char* cache_path = "./mobilenet_pipecache.bin"; + DataReaderFromEmpty dr; + ncnn::Mat input = RandomMat(224, 224, 3); + + // ------------------------------------------------- + // 1. 不使用 Pipeline Cache (首次加载) + // ------------------------------------------------- + std::cout << "\n==================================================" << std::endl; + std::cout << " Performance Test: Without Pipeline Cache " << std::endl; + std::cout << "==================================================" << std::endl; + { + ncnn::Net net_no_cache; + net_no_cache.opt.use_vulkan_compute = true; + + auto start = std::chrono::high_resolution_clock::now(); + + net_no_cache.load_param_mem(mobilenet_v3_param); + net_no_cache.load_model(dr); + + auto end = std::chrono::high_resolution_clock::now(); + time_no_cache = std::chrono::duration_cast>(end - start).count(); + std::cout << "Model loading time without cache: " << time_no_cache << " ms" << std::endl; + + // 推理以获得基准输出 + ncnn::Extractor ex = net_no_cache.create_extractor(); + ex.input("data", input); + ex.extract("output", output_no_cache); + + if (output_no_cache.empty()) + { + std::cerr << "Test failed: extraction without cache failed." << std::endl; + return -1; + } + + // 保存 cache 以供下一步使用 + if (net_no_cache.opt.pipeline_cache->save_cache(cache_path) != 0) + { + std::cerr << "Test failed: could not save pipeline cache to " << cache_path << std::endl; + return -1; + } + std::cout << "Pipeline cache generated and saved to " << cache_path << std::endl; + } + + // ------------------------------------------------- + // 2. 使用 Pipeline Cache (二次加载) + // ------------------------------------------------- + ncnn::Mat output_with_cache; + double time_with_cache = 0; + std::cout << "\n==================================================" << std::endl; + std::cout << " Performance Test: With Pipeline Cache " << std::endl; + std::cout << "==================================================" << std::endl; + { + ncnn::Net net_with_cache; + // 必须在加载模型前设置好 cache + net_with_cache.opt.pipeline_cache = new ncnn::PipelineCache(ncnn::get_gpu_device()); + net_with_cache.opt.use_vulkan_compute = true; + + auto start = std::chrono::high_resolution_clock::now(); + + // 从文件加载 cache + if (net_with_cache.opt.pipeline_cache->load_cache(cache_path) != 0) + { + std::cerr << "Test failed: could not load pipeline cache from " << cache_path << std::endl; + return -1; + } + net_with_cache.load_param_mem(mobilenet_v3_param); + net_with_cache.load_model(dr); + + auto end = std::chrono::high_resolution_clock::now(); + time_with_cache = std::chrono::duration_cast>(end - start).count(); + std::cout << "Model loading time with cache: " << time_with_cache << " ms" << std::endl; + + // 推理 + ncnn::Extractor ex2 = net_with_cache.create_extractor(); + ex2.input("data", input); + ex2.extract("output", output_with_cache); + + if (output_with_cache.empty()) + { + std::cerr << "Test failed: extraction with cache failed." << std::endl; + return -1; + } + } + + // ------------------------------------------------- + // 3. 结果验证与总结 + // ------------------------------------------------- + std::cout << "\n==================================================" << std::endl; + std::cout << " Verification and Summary " << std::endl; + std::cout << "==================================================" << std::endl; + + bool is_output_same = (CompareMat(output_no_cache, output_with_cache, 0.001) == 0); + + std::cout << "Output verification: " << (is_output_same ? "SUCCESS" : "FAILURE") << std::endl; + std::cout << "--------------------------------------------------" << std::endl; + std::cout << "Performance Summary:" << std::endl; + std::cout << " - Without Cache: " << time_no_cache << " ms" << std::endl; + std::cout << " - With Cache: " << time_with_cache << " ms" << std::endl; + + if (time_no_cache > 0) { + double speedup = (time_no_cache - time_with_cache) / time_no_cache * 100; + std::cout << " - Speedup: " << speedup << "%" << std::endl; + } + + if (!is_output_same) + { + std::cerr << "\nTest FAILED due to output mismatch." << std::endl; + return -1; + } + + std::cout << "\nTest PASSED." << std::endl; + return 0; +} + +int main() +{ + // 运行预热测试,检查基本IO功能 + if (warmup_gpu_pipecache() != 0) + { + return -1; + } + + // 运行性能对比测试 + return test_gpu_pipecache_performance(); +}