| @@ -98,6 +98,7 @@ struct layer_shader_registry_entry | |||
| static const layer_shader_registry_entry layer_shader_registry[] = { | |||
| #include "layer_shader_registry.h" | |||
| }; | |||
| static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry); | |||
| @@ -476,16 +477,16 @@ void GpuInfoPrivate::query_properties() | |||
| } | |||
| if (physicalDeviceProperties.vendorID == 0x13b5 | |||
| && (physicalDeviceProperties.deviceID == 0x7500001 | |||
| || physicalDeviceProperties.deviceID == 0x7501000 | |||
| || physicalDeviceProperties.deviceID == 0x8602000 | |||
| || physicalDeviceProperties.deviceID == 0x8800020 | |||
| || physicalDeviceProperties.deviceID == 0x70930000 | |||
| || physicalDeviceProperties.deviceID == 0x70901010 | |||
| || physicalDeviceProperties.deviceID == 0x72120000 | |||
| || physicalDeviceProperties.deviceID == 0x74021000 | |||
| || physicalDeviceProperties.deviceID == 0x60a00002 | |||
| || physicalDeviceProperties.deviceID == 0x62210001)) | |||
| && (physicalDeviceProperties.deviceID == 0x7500001 | |||
| || physicalDeviceProperties.deviceID == 0x7501000 | |||
| || physicalDeviceProperties.deviceID == 0x8602000 | |||
| || physicalDeviceProperties.deviceID == 0x8800020 | |||
| || physicalDeviceProperties.deviceID == 0x70930000 | |||
| || physicalDeviceProperties.deviceID == 0x70901010 | |||
| || physicalDeviceProperties.deviceID == 0x72120000 | |||
| || physicalDeviceProperties.deviceID == 0x74021000 | |||
| || physicalDeviceProperties.deviceID == 0x60a00002 | |||
| || physicalDeviceProperties.deviceID == 0x62210001)) | |||
| { | |||
| // NOTE rk3288/rk3399/t880/g31/g51/g52/g71/g72 | |||
| // however, g76/g77 has explicit fp16 arithmetic | |||
| @@ -494,9 +495,9 @@ void GpuInfoPrivate::query_properties() | |||
| } | |||
| if (physicalDeviceProperties.vendorID == 0x5143 | |||
| && (physicalDeviceProperties.deviceID == 0x6030001 | |||
| || physicalDeviceProperties.deviceID == 0x6040001 | |||
| || physicalDeviceProperties.deviceID == 0x6050002)) | |||
| && (physicalDeviceProperties.deviceID == 0x6030001 | |||
| || physicalDeviceProperties.deviceID == 0x6040001 | |||
| || physicalDeviceProperties.deviceID == 0x6050002)) | |||
| { | |||
| // TODO enable devices other than qcom845/qcom855/qcom855plus/qcom865 | |||
| // qcom adreno driver accept spirv with fp16 arithmetic | |||
| @@ -512,7 +513,7 @@ static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyPropert | |||
| const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i]; | |||
| if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT) | |||
| && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)) | |||
| && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)) | |||
| { | |||
| return i; | |||
| } | |||
| @@ -524,7 +525,7 @@ static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyPropert | |||
| const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i]; | |||
| if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT) | |||
| && (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)) | |||
| && (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)) | |||
| { | |||
| return i; | |||
| } | |||
| @@ -553,8 +554,8 @@ static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProper | |||
| const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i]; | |||
| if ((queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT) | |||
| && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT) | |||
| && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)) | |||
| && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT) | |||
| && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)) | |||
| { | |||
| return i; | |||
| } | |||
| @@ -1137,30 +1138,30 @@ void GpuInfoPrivate::query_extension_properties() | |||
| // NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope); | |||
| if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16 | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_KHR) | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_KHR) | |||
| { | |||
| support_cooperative_matrix_8_8_16 = true; | |||
| } | |||
| if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8 | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_KHR) | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_KHR) | |||
| { | |||
| support_cooperative_matrix_16_8_8 = true; | |||
| } | |||
| if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16 | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_KHR) | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_KHR) | |||
| { | |||
| support_cooperative_matrix_16_8_16 = true; | |||
| } | |||
| if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16 | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_KHR) | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_KHR) | |||
| { | |||
| support_cooperative_matrix_16_16_16 = true; | |||
| } | |||
| @@ -1194,30 +1195,30 @@ void GpuInfoPrivate::query_extension_properties() | |||
| // NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope); | |||
| if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16 | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_NV) | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_NV) | |||
| { | |||
| support_cooperative_matrix_8_8_16 = true; | |||
| } | |||
| if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8 | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_NV) | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_NV) | |||
| { | |||
| support_cooperative_matrix_16_8_8 = true; | |||
| } | |||
| if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16 | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_NV) | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_NV) | |||
| { | |||
| support_cooperative_matrix_16_8_16 = true; | |||
| } | |||
| if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16 | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_NV) | |||
| && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV | |||
| && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV | |||
| && cmp.scope == VK_SCOPE_SUBGROUP_NV) | |||
| { | |||
| support_cooperative_matrix_16_16_16 = true; | |||
| } | |||
| @@ -2031,8 +2032,8 @@ void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponen | |||
| const VkCooperativeMatrixPropertiesKHR& cmp = d->queryCooperativeMatrixSubProperties[i]; | |||
| if (cmp.AType == type && cmp.BType == type | |||
| && cmp.CType == acctype && cmp.ResultType == acctype | |||
| && cmp.scope == scope) | |||
| && cmp.CType == acctype && cmp.ResultType == acctype | |||
| && cmp.scope == scope) | |||
| { | |||
| mnk_properties.push_back(cmp); | |||
| } | |||
| @@ -2045,8 +2046,8 @@ void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponen | |||
| const VkCooperativeMatrixPropertiesNV& cmp = d->queryCooperativeMatrixSubPropertiesNV[i]; | |||
| if (cmp.AType == (VkComponentTypeNV)type && cmp.BType == (VkComponentTypeNV)type | |||
| && cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype | |||
| && cmp.scope == (VkScopeNV)scope) | |||
| && cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype | |||
| && cmp.scope == (VkScopeNV)scope) | |||
| { | |||
| VkCooperativeMatrixPropertiesKHR cmp_khr; | |||
| cmp_khr.MSize = cmp.MSize; | |||
| @@ -2459,7 +2460,7 @@ int create_gpu_instance(const char* driver_path) | |||
| #endif // __ANDROID_API__ >= 26 | |||
| uint32_t instance_api_version = VK_MAKE_VERSION(1, 0, 0); | |||
| typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion); | |||
| typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t* pApiVersion); | |||
| PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion"); | |||
| if (vkEnumerateInstanceVersion) | |||
| { | |||
| @@ -2672,7 +2673,7 @@ int create_gpu_instance(const char* driver_path) | |||
| fp16_matrix_properties.push_back(cmp); | |||
| } | |||
| if ((cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.AType == VK_COMPONENT_TYPE_SINT8_PACKED_NV) | |||
| && (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV)) | |||
| && (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV)) | |||
| { | |||
| bool mnk_hit = false; | |||
| for (size_t k = 0; k < int8_matrix_properties.size(); k++) | |||
| @@ -2703,9 +2704,9 @@ int create_gpu_instance(const char* driver_path) | |||
| bf16_matrix_properties.push_back(cmp); | |||
| } | |||
| if ((cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT | |||
| || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV) | |||
| && (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT | |||
| || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)) | |||
| || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV) | |||
| && (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT | |||
| || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)) | |||
| { | |||
| bool mnk_hit = false; | |||
| for (size_t k = 0; k < fp8_matrix_properties.size(); k++) | |||
| @@ -3143,8 +3144,9 @@ const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_ | |||
| uop->vkdev = vkdev; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack | |||
| pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int8 | |||
| pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 | |||
| : 8); // out_elempack | |||
| pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int8 | |||
| pd.set(3, cast_type_to_index + 1); | |||
| uop->load_param(pd); | |||
| @@ -3734,7 +3736,7 @@ int VulkanDevice::create_pipeline_layout(int push_constant_count, VkDescriptorSe | |||
| return 0; | |||
| } | |||
| int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const | |||
| int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline, VkPipelineCache pipeline_cache) const | |||
| { | |||
| const int specialization_count = specializations.size(); | |||
| @@ -3792,7 +3794,7 @@ int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout | |||
| computePipelineCreateInfo.basePipelineHandle = 0; | |||
| computePipelineCreateInfo.basePipelineIndex = 0; | |||
| VkResult ret = vkCreateComputePipelines(d->device, 0, 1, &computePipelineCreateInfo, 0, pipeline); | |||
| VkResult ret = vkCreateComputePipelines(d->device, pipeline_cache, 1, &computePipelineCreateInfo, 0, pipeline); | |||
| if (ret != VK_SUCCESS) | |||
| { | |||
| NCNN_LOGE("vkCreateComputePipelines failed %d", ret); | |||
| @@ -3871,6 +3873,18 @@ int VulkanDevice::create_descriptor_update_template(int binding_count, const int | |||
| return 0; | |||
| } | |||
| int VulkanDevice::create_pipeline_cache(const VkPipelineCacheCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineCache* pPipelineCache) const | |||
| { | |||
| VkResult ret = vkCreatePipelineCache(d->device, pCreateInfo, pAllocator, pPipelineCache); | |||
| if (ret != VK_SUCCESS) | |||
| { | |||
| NCNN_LOGE("vkCreatePipelineCache failed %d", ret); | |||
| return -1; | |||
| } | |||
| return 0; | |||
| } | |||
| uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const | |||
| { | |||
| const VkPhysicalDeviceMemoryProperties& memory_properties = info.physicalDeviceMemoryProperties(); | |||
| @@ -3883,8 +3897,8 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ | |||
| { | |||
| const VkMemoryType& memoryType = memory_properties.memoryTypes[i]; | |||
| if ((memoryType.propertyFlags & required) == required | |||
| && (preferred && (memoryType.propertyFlags & preferred)) | |||
| && (preferred_not && !(memoryType.propertyFlags & preferred_not))) | |||
| && (preferred && (memoryType.propertyFlags & preferred)) | |||
| && (preferred_not && !(memoryType.propertyFlags & preferred_not))) | |||
| { | |||
| return i; | |||
| } | |||
| @@ -3899,7 +3913,7 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ | |||
| { | |||
| const VkMemoryType& memoryType = memory_properties.memoryTypes[i]; | |||
| if ((memoryType.propertyFlags & required) == required | |||
| && (preferred && (memoryType.propertyFlags & preferred))) | |||
| && (preferred && (memoryType.propertyFlags & preferred))) | |||
| { | |||
| return i; | |||
| } | |||
| @@ -3914,7 +3928,7 @@ uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags requ | |||
| { | |||
| const VkMemoryType& memoryType = memory_properties.memoryTypes[i]; | |||
| if ((memoryType.propertyFlags & required) == required | |||
| && (preferred_not && !(memoryType.propertyFlags & preferred_not))) | |||
| && (preferred_not && !(memoryType.propertyFlags & preferred_not))) | |||
| { | |||
| return i; | |||
| } | |||
| @@ -4222,7 +4236,8 @@ void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempac | |||
| void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2; | |||
| int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 | |||
| : 2; | |||
| int cast_type_from_index; | |||
| if (src.elembits() == 32) | |||
| @@ -419,8 +419,9 @@ public: | |||
| // helper for creating pipeline | |||
| int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const; | |||
| int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const; | |||
| int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const; | |||
| int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline, VkPipelineCache pipeline_cache = 0) const; | |||
| int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const; | |||
| int create_pipeline_cache(const VkPipelineCacheCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkPipelineCache* pPipelineCache) const; | |||
| uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const; | |||
| bool is_mappable(uint32_t memory_type_index) const; | |||
| @@ -110,13 +110,42 @@ public: | |||
| ShaderInfo shader_info; // TODO use pointer ? | |||
| }; | |||
// Key identifying one compiled SPIR-V module: the shader type index plus the
// option bits it was compiled with.
//
// The two 32-bit fields overlay the single 64-bit value d0, so two keys can be
// compared with one integer comparison, and the whole struct is written to and
// read from the serialized cache as raw bytes.
struct spv_param
{
    union
    {
        struct
        {
            int32_t shader_type_index;
            uint32_t opt_bits;
        };
        uint64_t d0;
    };
};

// Both the d0 comparison and the raw-byte serialization rely on the two fields
// covering d0 exactly, with no padding.
static_assert(sizeof(spv_param) == sizeof(uint64_t), "spv_param must pack into exactly 64 bits");
| struct pipeline_cache_header | |||
| { | |||
| uint32_t magic = 0x5a545546; | |||
| uint32_t vendorID; // VkPhysicalDeviceProperties::vendorID | |||
| uint32_t deviceID; // VkPhysicalDeviceProperties::deviceID | |||
| uint32_t driverVersion; // VkPhysicalDeviceProperties::driverVersion | |||
| uint8_t uuid[VK_UUID_SIZE]; // VkPhysicalDeviceProperties::pipelineCacheUUID | |||
| uint32_t spv_size; // size of spirv data | |||
| uint32_t pipeline_cache_size; | |||
| }; | |||
| mutable std::vector<pipeline_cache_digest> cache_digests; | |||
| mutable std::vector<pipeline_cache_artifact> cache_artifacts; | |||
| VkPipelineCache vk_pipeline_cache; | |||
| mutable std::vector<std::pair<spv_param, std::vector<uint32_t> > > cache_spirv_module; // digest(index,opt) -> spirv data | |||
| mutable Mutex cache_lock; | |||
| }; | |||
| PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations, | |||
| uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size) | |||
| uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size) | |||
| { | |||
| spv_data_murmur3 = murmur3_32(spv_data, spv_data_size / 4); | |||
| @@ -134,7 +163,7 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(const uint32_ | |||
| } | |||
| PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations, | |||
| uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size) | |||
| uint32_t _local_size_x, uint32_t _local_size_y, uint32_t _local_size_z, uint32_t _subgroup_size) | |||
| { | |||
| shader_type_index = _shader_type_index; | |||
| @@ -160,6 +189,18 @@ PipelineCachePrivate::pipeline_cache_digest::pipeline_cache_digest(int _shader_t | |||
| PipelineCache::PipelineCache(const VulkanDevice* _vkdev) | |||
| : vkdev(_vkdev), d(new PipelineCachePrivate) | |||
| { | |||
| VkPipelineCacheCreateInfo pipelineCacheCreateInfo{}; | |||
| pipelineCacheCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; | |||
| pipelineCacheCreateInfo.initialDataSize = 0; // zeros for empty cache | |||
| pipelineCacheCreateInfo.pInitialData = nullptr; | |||
| int ret = 0; | |||
| ret = _vkdev->create_pipeline_cache(&pipelineCacheCreateInfo, 0, &d->vk_pipeline_cache); | |||
| if (ret != 0) | |||
| { | |||
| NCNN_LOGE("create_pipeline_cache failed %d", ret); | |||
| d->vk_pipeline_cache = 0; | |||
| } | |||
| } | |||
| PipelineCache::~PipelineCache() | |||
| @@ -381,18 +422,288 @@ int PipelineCache::get_pipeline(int shader_type_index, const Option& opt, const | |||
| return 0; | |||
| } | |||
| int PipelineCache::save_cache(std::vector<unsigned char>& buf) const | |||
| { | |||
| if (!vkdev) | |||
| { | |||
| NCNN_LOGE("vkdev is null"); | |||
| return -1; | |||
| } | |||
| MutexLockGuard lock(d->cache_lock); | |||
| PipelineCachePrivate::pipeline_cache_header header; | |||
| // Platform information | |||
| header.vendorID = vkdev->info.vendor_id(); | |||
| header.deviceID = vkdev->info.device_id(); | |||
| header.driverVersion = vkdev->info.driver_version(); | |||
| memcpy(header.uuid, vkdev->info.pipeline_cache_uuid(), VK_UUID_SIZE); | |||
| header.spv_size = d->cache_spirv_module.size(); | |||
| size_t buf_size = 0; | |||
| if (vkGetPipelineCacheData(vkdev->vkdevice(), d->vk_pipeline_cache, &buf_size, nullptr) != VK_SUCCESS) | |||
| { | |||
| NCNN_LOGE("vkGetPipelineCacheData failed"); | |||
| return -1; | |||
| } | |||
| header.pipeline_cache_size = (uint32_t)buf_size; | |||
| std::vector<unsigned char> pipe_data(header.pipeline_cache_size); | |||
| if (vkGetPipelineCacheData(vkdev->vkdevice(), d->vk_pipeline_cache, &buf_size, pipe_data.data()) != VK_SUCCESS) | |||
| { | |||
| NCNN_LOGE("vkGetPipelineCacheData failed"); | |||
| return -1; | |||
| } | |||
| buf.resize(sizeof(header)); | |||
| memcpy(buf.data(), &header, sizeof(header)); | |||
| // spv_digest and spv_data | |||
| for (size_t i = 0; i < d->cache_spirv_module.size(); i++) | |||
| { | |||
| const PipelineCachePrivate::spv_param& sd = d->cache_spirv_module[i].first; | |||
| const std::vector<uint32_t>& spv_data = d->cache_spirv_module[i].second; | |||
| uint32_t size = (uint32_t)spv_data.size(); | |||
| size_t current_buf_size = buf.size(); | |||
| buf.resize(current_buf_size + sizeof(sd) + sizeof(size) + spv_data.size() * sizeof(uint32_t)); | |||
| memcpy(buf.data() + current_buf_size, &sd, sizeof(sd)); | |||
| current_buf_size += sizeof(sd); | |||
| memcpy(buf.data() + current_buf_size, &size, sizeof(size)); | |||
| current_buf_size += sizeof(size); | |||
| memcpy(buf.data() + current_buf_size, spv_data.data(), spv_data.size() * sizeof(uint32_t)); | |||
| } | |||
| buf.insert(buf.end(), pipe_data.begin(), pipe_data.end()); | |||
| return 0; | |||
| } | |||
| int PipelineCache::load_cache(const std::vector<unsigned char>& buf) const | |||
| { | |||
| if (!vkdev) | |||
| { | |||
| NCNN_LOGE("vkdev is null"); | |||
| return -1; | |||
| } | |||
| MutexLockGuard lock(d->cache_lock); | |||
| // Corrected struct name to pipeline_cache_header (lowercase h) | |||
| if (buf.size() < sizeof(PipelineCachePrivate::pipeline_cache_header)) | |||
| { | |||
| NCNN_LOGE("Invalid cache buffer size: too small for header"); | |||
| return -1; | |||
| } | |||
| PipelineCachePrivate::pipeline_cache_header header; | |||
| memcpy(&header, buf.data(), sizeof(header)); | |||
| // Validate magic number | |||
| if (header.magic != 0x5a545546) | |||
| { | |||
| NCNN_LOGE("Invalid cache magic number"); | |||
| return -1; | |||
| } | |||
| // Validate platform information for compatibility | |||
| if (header.vendorID != vkdev->info.vendor_id() || header.deviceID != vkdev->info.device_id() || header.driverVersion != vkdev->info.driver_version() || memcmp(header.uuid, vkdev->info.pipeline_cache_uuid(), VK_UUID_SIZE) != 0) | |||
| { | |||
| NCNN_LOGE("Cache platform mismatch, might be incompatible."); | |||
| return -1; | |||
| } | |||
| size_t current_offset = sizeof(header); | |||
| // Load SPIR-V data and associated spv_param | |||
| d->cache_spirv_module.reserve(header.spv_size); | |||
| for (uint32_t i = 0; i < header.spv_size; ++i) | |||
| { | |||
| if (current_offset + sizeof(PipelineCachePrivate::spv_param) + sizeof(uint32_t) > buf.size()) | |||
| { | |||
| NCNN_LOGE("Invalid cache buffer size: incomplete spv_param or size for entry %u", i); | |||
| return -1; | |||
| } | |||
| PipelineCachePrivate::spv_param sd; | |||
| memcpy(&sd, buf.data() + current_offset, sizeof(sd)); | |||
| current_offset += sizeof(sd); | |||
| uint32_t spv_vec_size_uint32; // Size in uint32_t units | |||
| memcpy(&spv_vec_size_uint32, buf.data() + current_offset, sizeof(spv_vec_size_uint32)); | |||
| current_offset += sizeof(spv_vec_size_uint32); | |||
| size_t spv_data_byte_size = spv_vec_size_uint32 * sizeof(uint32_t); | |||
| if (current_offset + spv_data_byte_size > buf.size()) | |||
| { | |||
| NCNN_LOGE("Invalid cache buffer size: incomplete spv_data for entry %u", i); | |||
| return -1; | |||
| } | |||
| std::vector<uint32_t> spirv_data(spv_vec_size_uint32); | |||
| memcpy(spirv_data.data(), buf.data() + current_offset, spv_data_byte_size); | |||
| current_offset += spv_data_byte_size; | |||
| d->cache_spirv_module.push_back({sd, spirv_data}); | |||
| } | |||
| // Load Vulkan Pipeline Cache Data | |||
| if (current_offset + header.pipeline_cache_size > buf.size()) | |||
| { | |||
| NCNN_LOGE("Invalid cache buffer size: incomplete pipeline cache data"); | |||
| return -1; | |||
| } | |||
| if (d->vk_pipeline_cache) | |||
| { | |||
| vkDestroyPipelineCache(vkdev->vkdevice(), d->vk_pipeline_cache, 0); | |||
| d->vk_pipeline_cache = 0; | |||
| } | |||
| VkPipelineCacheCreateInfo pipelineCacheCreateInfo{}; | |||
| pipelineCacheCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; | |||
| pipelineCacheCreateInfo.initialDataSize = header.pipeline_cache_size; | |||
| pipelineCacheCreateInfo.pInitialData = buf.data() + current_offset; | |||
| int ret = vkdev->create_pipeline_cache(&pipelineCacheCreateInfo, 0, &d->vk_pipeline_cache); | |||
| if (ret != 0) | |||
| { | |||
| NCNN_LOGE("create_pipeline_cache with initial data failed %d", ret); | |||
| d->vk_pipeline_cache = 0; | |||
| return -1; | |||
| } | |||
| return 0; | |||
| } | |||
| int PipelineCache::save_cache(FILE* fp) const | |||
| { | |||
| if (!fp) | |||
| { | |||
| NCNN_LOGE("Invalid FILE pointer for saving cache."); | |||
| return -1; | |||
| } | |||
| std::vector<unsigned char> buf; | |||
| int ret = save_cache(buf); | |||
| if (ret != 0) | |||
| { | |||
| NCNN_LOGE("Failed to get cache data into buffer for saving to file."); | |||
| return ret; | |||
| } | |||
| if (fwrite(buf.data(), 1, buf.size(), fp) != buf.size()) | |||
| { | |||
| NCNN_LOGE("Failed to write cache data to file."); | |||
| return -1; | |||
| } | |||
| return 0; | |||
| } | |||
| int PipelineCache::load_cache(FILE* fp) const | |||
| { | |||
| if (!fp) | |||
| { | |||
| NCNN_LOGE("Invalid FILE pointer for loading cache."); | |||
| return -1; | |||
| } | |||
| fseek(fp, 0, SEEK_END); | |||
| long file_size = ftell(fp); | |||
| fseek(fp, 0, SEEK_SET); | |||
| if (file_size < 0) | |||
| { | |||
| NCNN_LOGE("Failed to determine file size for loading cache."); | |||
| return -1; | |||
| } | |||
| std::vector<unsigned char> buf(file_size); | |||
| if (fread(buf.data(), 1, file_size, fp) != (size_t)file_size) | |||
| { | |||
| NCNN_LOGE("Failed to read cache data from file."); | |||
| return -1; | |||
| } | |||
| return load_cache(buf); | |||
| } | |||
| int PipelineCache::save_cache(const char* filename) const | |||
| { | |||
| if (!filename) | |||
| { | |||
| NCNN_LOGE("Invalid filename for saving cache."); | |||
| return -1; | |||
| } | |||
| FILE* fp = fopen(filename, "wb"); | |||
| if (!fp) | |||
| { | |||
| NCNN_LOGE("Failed to open file %s for writing cache.", filename); | |||
| return -1; | |||
| } | |||
| int ret = save_cache(fp); | |||
| fclose(fp); | |||
| return ret; | |||
| } | |||
| int PipelineCache::load_cache(const char* filename) const | |||
| { | |||
| if (!filename) | |||
| { | |||
| NCNN_LOGE("Invalid filename for loading cache."); | |||
| return -1; | |||
| } | |||
| FILE* fp = fopen(filename, "rb"); | |||
| if (!fp) | |||
| { | |||
| NCNN_LOGE("Failed to open file %s for reading cache.", filename); | |||
| return -1; | |||
| } | |||
| int ret = load_cache(fp); | |||
| fclose(fp); | |||
| return ret; | |||
| } | |||
| int PipelineCache::create_shader_module(int shader_type_index, const Option& opt, | |||
| uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, | |||
| VkShaderModule* _shader_module, ShaderInfo& si) const | |||
| { | |||
| uint32_t opt_bits = 0 << 7 | |||
| | opt.use_fp16_packed << 6 | |||
| | opt.use_fp16_storage << 5 | |||
| | opt.use_fp16_arithmetic << 4 | |||
| | opt.use_int8_storage << 3 | |||
| | opt.use_int8_arithmetic << 2; | |||
| std::vector<uint32_t> spirv; | |||
| for (int i = 0; i < d->cache_spirv_module.size(); i++) | |||
| { | |||
| if (d->cache_spirv_module[i].first.d0 == PipelineCachePrivate::spv_param({shader_type_index, opt_bits}).d0) // hit cache | |||
| { | |||
| spirv = d->cache_spirv_module[i].second; | |||
| goto hit_cache; | |||
| } | |||
| } | |||
| int retc = compile_spirv_module(shader_type_index, opt, spirv); | |||
| if (retc != 0) | |||
| { | |||
| NCNN_LOGE("compile_spirv_module failed %d", retc); | |||
| return -1; | |||
| } | |||
| d->cache_spirv_module.push_back({{shader_type_index, opt_bits}, spirv}); | |||
| hit_cache: | |||
| const uint32_t* spv_data = spirv.data(); | |||
| size_t spv_data_size = spirv.size() * 4; | |||
| @@ -445,7 +756,7 @@ int PipelineCache::new_pipeline(VkShaderModule shader_module, const ShaderInfo& | |||
| if (ret != 0) | |||
| goto ERROR_PipelineCache; | |||
| ret = vkdev->create_pipeline(shader_module, pipeline_layout, specializations, subgroup_size, &pipeline); | |||
| ret = vkdev->create_pipeline(shader_module, pipeline_layout, specializations, subgroup_size, &pipeline, d->vk_pipeline_cache); | |||
| if (ret != 0) | |||
| goto ERROR_PipelineCache; | |||
| @@ -42,6 +42,16 @@ public: | |||
| VkDescriptorUpdateTemplateKHR* descriptor_update_template, | |||
| ShaderInfo& shader_info) const; | |||
| int save_cache(std::vector<unsigned char> &buf) const; | |||
| int load_cache(const std::vector<unsigned char>& buf) const; | |||
| #ifdef NCNN_STDIO | |||
| int save_cache(FILE* fp) const; | |||
| int load_cache(FILE* fp) const; | |||
| int save_cache(const char* fp) const; | |||
| int load_cache(const char* fp) const; | |||
| #endif | |||
| protected: | |||
| int create_shader_module(int shader_type_index, const Option& opt, | |||
| uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, | |||
| @@ -65,6 +65,7 @@ ncnn_add_test(paramdict) | |||
| if(NCNN_VULKAN) | |||
| ncnn_add_test(command) | |||
| ncnn_add_test(pipecache) | |||
| endif() | |||
| if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") | |||
| @@ -0,0 +1,405 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2025 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "datareader.h" | |||
| #include "gpu.h" | |||
| #include "mat.h" | |||
| #include "net.h" | |||
| #include "pipelinecache.h" | |||
| #include "testutil.h" | |||
| #include <iostream> | |||
| #include <chrono> | |||
| #include <vector> | |||
| // 一个空数据读取器,用于加载模型结构,权重将全部为0 | |||
| class DataReaderFromEmpty : public ncnn::DataReader | |||
| { | |||
| public: | |||
| virtual int scan(const char* format, void* p) const | |||
| { | |||
| (void)format; // unused | |||
| (void)p; // unused | |||
| return 0; | |||
| } | |||
| virtual size_t read(void* buf, size_t size) const | |||
| { | |||
| memset(buf, 0, size); | |||
| return size; | |||
| } | |||
| }; | |||
| // Network structure (ncnn param text) for MobileNetV3 | |||
| static const char* mobilenet_v3_param = R"delimiter( | |||
| 7767517 | |||
| 145 163 | |||
| Input data 0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3 | |||
| Convolution 313 1 1 data 313 -23330=4,3,112,112,16 0=16 1=3 3=2 4=1 5=1 6=432 | |||
| Split splitncnn_0 1 2 313 313_splitncnn_0 313_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16 | |||
| HardSigmoid 319 1 1 313_splitncnn_1 319 -23330=4,3,112,112,16 | |||
| BinaryOp 320 2 1 313_splitncnn_0 319 320 -23330=4,3,112,112,16 0=2 | |||
| Split splitncnn_1 1 2 320 320_splitncnn_0 320_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16 | |||
| ConvolutionDepthWise 321 1 1 320_splitncnn_1 323 -23330=4,3,112,112,16 0=16 1=3 4=1 5=1 6=144 7=16 9=1 | |||
| Convolution 324 1 1 323 324 -23330=4,3,112,112,16 0=16 1=1 5=1 6=256 | |||
| BinaryOp 326 2 1 320_splitncnn_0 324 326 -23330=4,3,112,112,16 | |||
| Convolution 327 1 1 326 329 -23330=4,3,112,112,64 0=64 1=1 5=1 6=1024 9=1 | |||
| ConvolutionDepthWise 330 1 1 329 332 -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1 | |||
| Convolution 333 1 1 332 333 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1536 | |||
| Split splitncnn_2 1 2 333 333_splitncnn_0 333_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24 | |||
| Convolution 335 1 1 333_splitncnn_1 337 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1 | |||
| ConvolutionDepthWise 338 1 1 337 340 -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1 | |||
| Convolution 341 1 1 340 341 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728 | |||
| BinaryOp 343 2 1 333_splitncnn_0 341 343 -23330=4,3,56,56,24 | |||
| Convolution 344 1 1 343 346 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1 | |||
| ConvolutionDepthWise 347 1 1 346 347 -23330=4,3,28,28,72 0=72 1=5 3=2 4=2 5=1 6=1800 7=72 | |||
| Split splitncnn_3 1 2 347 347_splitncnn_0 347_splitncnn_1 -23330=8,3,28,28,72,3,28,28,72 | |||
| Pooling 355 1 1 347_splitncnn_1 359 -23330=4,1,72,1,1 0=1 4=1 | |||
| InnerProduct 360 1 1 359 361 -23330=4,1,18,1,1 0=18 1=1 2=1296 9=1 | |||
| InnerProduct 362 1 1 361 362 -23330=4,1,72,1,1 0=72 1=1 2=1296 | |||
| HardSigmoid 367 1 1 362 367 -23330=4,1,72,1,1 | |||
| BinaryOp 376 2 1 347_splitncnn_0 367 376 -23330=4,3,28,28,72 0=2 | |||
| ReLU 377 1 1 376 377 -23330=4,3,28,28,72 | |||
| Convolution 378 1 1 377 378 -23330=4,3,28,28,40 0=40 1=1 5=1 6=2880 | |||
| Split splitncnn_4 1 2 378 378_splitncnn_0 378_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 | |||
| Convolution 380 1 1 378_splitncnn_1 382 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1 | |||
| ConvolutionDepthWise 383 1 1 382 383 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 | |||
| Split splitncnn_5 1 2 383 383_splitncnn_0 383_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120 | |||
| Pooling 391 1 1 383_splitncnn_1 395 -23330=4,1,120,1,1 0=1 4=1 | |||
| InnerProduct 396 1 1 395 397 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1 | |||
| InnerProduct 398 1 1 397 398 -23330=4,1,120,1,1 0=120 1=1 2=3600 | |||
| HardSigmoid 403 1 1 398 403 -23330=4,1,120,1,1 | |||
| BinaryOp 412 2 1 383_splitncnn_0 403 412 -23330=4,3,28,28,120 0=2 | |||
| ReLU 413 1 1 412 413 -23330=4,3,28,28,120 | |||
| Convolution 414 1 1 413 414 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800 | |||
| BinaryOp 416 2 1 378_splitncnn_0 414 416 -23330=4,3,28,28,40 | |||
| Split splitncnn_6 1 2 416 416_splitncnn_0 416_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40 | |||
| Convolution 417 1 1 416_splitncnn_1 419 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1 | |||
| ConvolutionDepthWise 420 1 1 419 420 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 | |||
| Split splitncnn_7 1 2 420 420_splitncnn_0 420_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120 | |||
| Pooling 428 1 1 420_splitncnn_1 432 -23330=4,1,120,1,1 0=1 4=1 | |||
| InnerProduct 433 1 1 432 434 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1 | |||
| InnerProduct 435 1 1 434 435 -23330=4,1,120,1,1 0=120 1=1 2=3600 | |||
| HardSigmoid 440 1 1 435 440 -23330=4,1,120,1,1 | |||
| BinaryOp 449 2 1 420_splitncnn_0 440 449 -23330=4,3,28,28,120 0=2 | |||
| ReLU 450 1 1 449 450 -23330=4,3,28,28,120 | |||
| Convolution 451 1 1 450 451 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800 | |||
| BinaryOp 453 2 1 416_splitncnn_0 451 453 -23330=4,3,28,28,40 | |||
| Convolution 454 1 1 453 454 -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600 | |||
| HardSwish 461 1 1 454 461 -23330=4,3,28,28,240 | |||
| ConvolutionDepthWise 462 1 1 461 462 -23330=4,3,14,14,240 0=240 1=3 3=2 4=1 5=1 6=2160 7=240 | |||
| HardSwish 469 1 1 462 469 -23330=4,3,14,14,240 | |||
| Convolution 470 1 1 469 470 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200 | |||
| Split splitncnn_8 1 2 470 470_splitncnn_0 470_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 | |||
| Convolution 472 1 1 470_splitncnn_1 472 -23330=4,3,14,14,200 0=200 1=1 5=1 6=16000 | |||
| HardSwish 479 1 1 472 479 -23330=4,3,14,14,200 | |||
| ConvolutionDepthWise 480 1 1 479 480 -23330=4,3,14,14,200 0=200 1=3 4=1 5=1 6=1800 7=200 | |||
| HardSwish 487 1 1 480 487 -23330=4,3,14,14,200 | |||
| Convolution 488 1 1 487 488 -23330=4,3,14,14,80 0=80 1=1 5=1 6=16000 | |||
| BinaryOp 490 2 1 470_splitncnn_0 488 490 -23330=4,3,14,14,80 | |||
| Split splitncnn_9 1 2 490 490_splitncnn_0 490_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 | |||
| Convolution 491 1 1 490_splitncnn_1 491 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720 | |||
| HardSwish 498 1 1 491 498 -23330=4,3,14,14,184 | |||
| ConvolutionDepthWise 499 1 1 498 499 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184 | |||
| HardSwish 506 1 1 499 506 -23330=4,3,14,14,184 | |||
| Convolution 507 1 1 506 507 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720 | |||
| BinaryOp 509 2 1 490_splitncnn_0 507 509 -23330=4,3,14,14,80 | |||
| Split splitncnn_10 1 2 509 509_splitncnn_0 509_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80 | |||
| Convolution 510 1 1 509_splitncnn_1 510 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720 | |||
| HardSwish 517 1 1 510 517 -23330=4,3,14,14,184 | |||
| ConvolutionDepthWise 518 1 1 517 518 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184 | |||
| HardSwish 525 1 1 518 525 -23330=4,3,14,14,184 | |||
| Convolution 526 1 1 525 526 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720 | |||
| BinaryOp 528 2 1 509_splitncnn_0 526 528 -23330=4,3,14,14,80 | |||
| Convolution 529 1 1 528 529 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 | |||
| HardSwish 536 1 1 529 536 -23330=4,3,14,14,480 | |||
| ConvolutionDepthWise 537 1 1 536 537 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480 | |||
| Split splitncnn_11 1 2 537 537_splitncnn_0 537_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480 | |||
| Pooling 545 1 1 537_splitncnn_1 549 -23330=4,1,480,1,1 0=1 4=1 | |||
| InnerProduct 550 1 1 549 551 -23330=4,1,120,1,1 0=120 1=1 2=57600 9=1 | |||
| InnerProduct 552 1 1 551 552 -23330=4,1,480,1,1 0=480 1=1 2=57600 | |||
| HardSigmoid 557 1 1 552 557 -23330=4,1,480,1,1 | |||
| BinaryOp 566 2 1 537_splitncnn_0 557 566 -23330=4,3,14,14,480 0=2 | |||
| HardSwish 572 1 1 566 572 -23330=4,3,14,14,480 | |||
| Convolution 573 1 1 572 573 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760 | |||
| Split splitncnn_12 1 2 573 573_splitncnn_0 573_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112 | |||
| Convolution 575 1 1 573_splitncnn_1 575 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264 | |||
| HardSwish 582 1 1 575 582 -23330=4,3,14,14,672 | |||
| ConvolutionDepthWise 583 1 1 582 583 -23330=4,3,14,14,672 0=672 1=3 4=1 5=1 6=6048 7=672 | |||
| Split splitncnn_13 1 2 583 583_splitncnn_0 583_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672 | |||
| Pooling 591 1 1 583_splitncnn_1 595 -23330=4,1,672,1,1 0=1 4=1 | |||
| InnerProduct 596 1 1 595 597 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1 | |||
| InnerProduct 598 1 1 597 598 -23330=4,1,672,1,1 0=672 1=1 2=112896 | |||
| HardSigmoid 603 1 1 598 603 -23330=4,1,672,1,1 | |||
| BinaryOp 612 2 1 583_splitncnn_0 603 612 -23330=4,3,14,14,672 0=2 | |||
| HardSwish 618 1 1 612 618 -23330=4,3,14,14,672 | |||
| Convolution 619 1 1 618 619 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264 | |||
| BinaryOp 621 2 1 573_splitncnn_0 619 621 -23330=4,3,14,14,112 | |||
| Convolution 622 1 1 621 622 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264 | |||
| HardSwish 629 1 1 622 629 -23330=4,3,14,14,672 | |||
| ConvolutionDepthWise 630 1 1 629 630 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672 | |||
| Split splitncnn_14 1 2 630 630_splitncnn_0 630_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672 | |||
| Pooling 638 1 1 630_splitncnn_1 642 -23330=4,1,672,1,1 0=1 4=1 | |||
| InnerProduct 643 1 1 642 644 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1 | |||
| InnerProduct 645 1 1 644 645 -23330=4,1,672,1,1 0=672 1=1 2=112896 | |||
| HardSigmoid 650 1 1 645 650 -23330=4,1,672,1,1 | |||
| BinaryOp 659 2 1 630_splitncnn_0 650 659 -23330=4,3,14,14,672 0=2 | |||
| HardSwish 665 1 1 659 665 -23330=4,3,14,14,672 | |||
| Convolution 666 1 1 665 666 -23330=4,3,14,14,160 0=160 1=1 5=1 6=107520 | |||
| Convolution 668 1 1 666 668 -23330=4,3,14,14,672 0=672 1=1 5=1 6=107520 | |||
| HardSwish 675 1 1 668 675 -23330=4,3,14,14,672 | |||
| ConvolutionDepthWise 676 1 1 675 676 -23330=4,3,7,7,672 0=672 1=5 3=2 4=2 5=1 6=16800 7=672 | |||
| Split splitncnn_15 1 2 676 676_splitncnn_0 676_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672 | |||
| Pooling 684 1 1 676_splitncnn_1 688 -23330=4,1,672,1,1 0=1 4=1 | |||
| InnerProduct 689 1 1 688 690 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1 | |||
| InnerProduct 691 1 1 690 691 -23330=4,1,672,1,1 0=672 1=1 2=112896 | |||
| HardSigmoid 696 1 1 691 696 -23330=4,1,672,1,1 | |||
| BinaryOp 705 2 1 676_splitncnn_0 696 705 -23330=4,3,7,7,672 0=2 | |||
| HardSwish 711 1 1 705 711 -23330=4,3,7,7,672 | |||
| Convolution 712 1 1 711 712 -23330=4,3,7,7,160 0=160 1=1 5=1 6=107520 | |||
| Split splitncnn_16 1 2 712 712_splitncnn_0 712_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160 | |||
| Convolution 714 1 1 712_splitncnn_1 714 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600 | |||
| HardSwish 721 1 1 714 721 -23330=4,3,7,7,960 | |||
| ConvolutionDepthWise 722 1 1 721 722 -23330=4,3,7,7,960 0=960 1=5 4=2 5=1 6=24000 7=960 | |||
| Split splitncnn_17 1 2 722 722_splitncnn_0 722_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960 | |||
| Pooling 730 1 1 722_splitncnn_1 734 -23330=4,1,960,1,1 0=1 4=1 | |||
| InnerProduct 735 1 1 734 736 -23330=4,1,240,1,1 0=240 1=1 2=230400 9=1 | |||
| InnerProduct 737 1 1 736 737 -23330=4,1,960,1,1 0=960 1=1 2=230400 | |||
| HardSigmoid 742 1 1 737 742 -23330=4,1,960,1,1 | |||
| BinaryOp 751 2 1 722_splitncnn_0 742 751 -23330=4,3,7,7,960 0=2 | |||
| HardSwish 757 1 1 751 757 -23330=4,3,7,7,960 | |||
| Convolution 758 1 1 757 758 -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600 | |||
| BinaryOp 760 2 1 712_splitncnn_0 758 760 -23330=4,3,7,7,160 | |||
| Convolution 761 1 1 760 761 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600 | |||
| HardSwish 768 1 1 761 768 -23330=4,3,7,7,960 | |||
| Pooling 769 1 1 768 769 -23330=4,1,960,1,1 0=1 4=1 | |||
| HardSwish 775 1 1 769 775 -23330=4,1,960,1,1 | |||
| Reshape 783 1 1 775 783 -23330=4,1,960,1,1 0=-1 | |||
| InnerProduct 784 1 1 783 784 -23330=4,1,1280,1,1 0=1280 1=1 2=1228800 | |||
| HardSwish 790 1 1 784 790 -23330=4,1,1280,1,1 | |||
| InnerProduct 791 1 1 790 791 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000 | |||
| Softmax prob 1 1 791 output -23330=4,1,1000,1,1 | |||
| )delimiter"; | |||
| /** | |||
| * @brief 使用一个简单的 Sigmoid 网络预热并测试 Pipeline Cache 的基本保存和加载功能 | |||
| * @return 0 on success, -1 on failure | |||
| */ | |||
| static int warmup_gpu_pipecache() | |||
| { | |||
| std::cout << "==================================================" << std::endl; | |||
| std::cout << " Warmup: Testing Basic Cache IO " << std::endl; | |||
| std::cout << "==================================================" << std::endl; | |||
| // 1. 创建一个网络,运行一次以生成 pipeline | |||
| ncnn::Net net; | |||
| net.opt.use_vulkan_compute = true; | |||
| net.load_param_mem("7767517\n2 2\nInput input0 0 1 input0\nSigmoid sigmoid0 1 1 input0 output0"); | |||
| net.load_model((unsigned char*)""); // 用于创建 pipeline | |||
| ncnn::Mat input0 = RandomMat(224, 224); | |||
| ncnn::Mat output0; | |||
| { | |||
| ncnn::Extractor ex = net.create_extractor(); | |||
| ex.input("input0", input0); | |||
| ex.extract("output0", output0); | |||
| } | |||
| if (output0.empty()) | |||
| { | |||
| std::cerr << "Warmup failed: initial extraction failed." << std::endl; | |||
| return -1; | |||
| } | |||
| // 2. 保存 pipeline cache | |||
| const char* cache_path = "./sigmoid_pipecache.bin"; | |||
| if (net.opt.pipeline_cache->save_cache(cache_path) != 0) | |||
| { | |||
| std::cerr << "Warmup failed: could not save pipeline cache to " << cache_path << std::endl; | |||
| return -1; | |||
| } | |||
| std::cout << "Warmup: Pipeline cache saved successfully." << std::endl; | |||
| // 3. 创建第二个网络,加载刚才保存的 cache | |||
| ncnn::Net net2; | |||
| net2.opt.use_vulkan_compute = true; | |||
| net2.opt.pipeline_cache = new ncnn::PipelineCache(net.vulkan_device()); | |||
| net2.load_param_mem("7767517\n2 2\nInput input0 0 1 input0\nSigmoid sigmoid0 1 1 input0 output0"); | |||
| if (net2.opt.pipeline_cache->load_cache(cache_path) != 0) | |||
| { | |||
| std::cerr << "Warmup failed: could not load pipeline cache from " << cache_path << std::endl; | |||
| return -1; | |||
| } | |||
| std::cout << "Warmup: Pipeline cache loaded successfully." << std::endl; | |||
| net2.load_model((unsigned char*)""); // 创建 pipeline | |||
| // 4. 再次推理并验证结果是否一致 | |||
| ncnn::Mat output0_2; | |||
| { | |||
| ncnn::Extractor ex2 = net2.create_extractor(); | |||
| ex2.input("input0", input0); | |||
| ex2.extract("output0", output0_2); | |||
| } | |||
| if (output0_2.empty()) | |||
| { | |||
| std::cerr << "Warmup failed: extraction after loading cache failed." << std::endl; | |||
| return -1; | |||
| } | |||
| if (CompareMat(output0, output0_2, 0.001) != 0) | |||
| { | |||
| std::cerr << "Warmup failed: output mismatch after loading cache." << std::endl; | |||
| return -1; | |||
| } | |||
| std::cout << "Warmup PASSED: Outputs are identical." << std::endl; | |||
| return 0; | |||
| } | |||
| /** | |||
| * @brief 对比使用和不使用 Pipeline Cache 时的模型加载性能 | |||
| * @return 0 on success, -1 on failure | |||
| */ | |||
| static int test_gpu_pipecache_performance() | |||
| { | |||
| ncnn::Mat output_no_cache; | |||
| double time_no_cache = 0; | |||
| const char* cache_path = "./mobilenet_pipecache.bin"; | |||
| DataReaderFromEmpty dr; | |||
| ncnn::Mat input = RandomMat(224, 224, 3); | |||
| // ------------------------------------------------- | |||
| // 1. 不使用 Pipeline Cache (首次加载) | |||
| // ------------------------------------------------- | |||
| std::cout << "\n==================================================" << std::endl; | |||
| std::cout << " Performance Test: Without Pipeline Cache " << std::endl; | |||
| std::cout << "==================================================" << std::endl; | |||
| { | |||
| ncnn::Net net_no_cache; | |||
| net_no_cache.opt.use_vulkan_compute = true; | |||
| auto start = std::chrono::high_resolution_clock::now(); | |||
| net_no_cache.load_param_mem(mobilenet_v3_param); | |||
| net_no_cache.load_model(dr); | |||
| auto end = std::chrono::high_resolution_clock::now(); | |||
| time_no_cache = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start).count(); | |||
| std::cout << "Model loading time without cache: " << time_no_cache << " ms" << std::endl; | |||
| // 推理以获得基准输出 | |||
| ncnn::Extractor ex = net_no_cache.create_extractor(); | |||
| ex.input("data", input); | |||
| ex.extract("output", output_no_cache); | |||
| if (output_no_cache.empty()) | |||
| { | |||
| std::cerr << "Test failed: extraction without cache failed." << std::endl; | |||
| return -1; | |||
| } | |||
| // 保存 cache 以供下一步使用 | |||
| if (net_no_cache.opt.pipeline_cache->save_cache(cache_path) != 0) | |||
| { | |||
| std::cerr << "Test failed: could not save pipeline cache to " << cache_path << std::endl; | |||
| return -1; | |||
| } | |||
| std::cout << "Pipeline cache generated and saved to " << cache_path << std::endl; | |||
| } | |||
| // ------------------------------------------------- | |||
| // 2. 使用 Pipeline Cache (二次加载) | |||
| // ------------------------------------------------- | |||
| ncnn::Mat output_with_cache; | |||
| double time_with_cache = 0; | |||
| std::cout << "\n==================================================" << std::endl; | |||
| std::cout << " Performance Test: With Pipeline Cache " << std::endl; | |||
| std::cout << "==================================================" << std::endl; | |||
| { | |||
| ncnn::Net net_with_cache; | |||
| // 必须在加载模型前设置好 cache | |||
| net_with_cache.opt.pipeline_cache = new ncnn::PipelineCache(ncnn::get_gpu_device()); | |||
| net_with_cache.opt.use_vulkan_compute = true; | |||
| auto start = std::chrono::high_resolution_clock::now(); | |||
| // 从文件加载 cache | |||
| if (net_with_cache.opt.pipeline_cache->load_cache(cache_path) != 0) | |||
| { | |||
| std::cerr << "Test failed: could not load pipeline cache from " << cache_path << std::endl; | |||
| return -1; | |||
| } | |||
| net_with_cache.load_param_mem(mobilenet_v3_param); | |||
| net_with_cache.load_model(dr); | |||
| auto end = std::chrono::high_resolution_clock::now(); | |||
| time_with_cache = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start).count(); | |||
| std::cout << "Model loading time with cache: " << time_with_cache << " ms" << std::endl; | |||
| // 推理 | |||
| ncnn::Extractor ex2 = net_with_cache.create_extractor(); | |||
| ex2.input("data", input); | |||
| ex2.extract("output", output_with_cache); | |||
| if (output_with_cache.empty()) | |||
| { | |||
| std::cerr << "Test failed: extraction with cache failed." << std::endl; | |||
| return -1; | |||
| } | |||
| } | |||
| // ------------------------------------------------- | |||
| // 3. 结果验证与总结 | |||
| // ------------------------------------------------- | |||
| std::cout << "\n==================================================" << std::endl; | |||
| std::cout << " Verification and Summary " << std::endl; | |||
| std::cout << "==================================================" << std::endl; | |||
| bool is_output_same = (CompareMat(output_no_cache, output_with_cache, 0.001) == 0); | |||
| std::cout << "Output verification: " << (is_output_same ? "SUCCESS" : "FAILURE") << std::endl; | |||
| std::cout << "--------------------------------------------------" << std::endl; | |||
| std::cout << "Performance Summary:" << std::endl; | |||
| std::cout << " - Without Cache: " << time_no_cache << " ms" << std::endl; | |||
| std::cout << " - With Cache: " << time_with_cache << " ms" << std::endl; | |||
| if (time_no_cache > 0) { | |||
| double speedup = (time_no_cache - time_with_cache) / time_no_cache * 100; | |||
| std::cout << " - Speedup: " << speedup << "%" << std::endl; | |||
| } | |||
| if (!is_output_same) | |||
| { | |||
| std::cerr << "\nTest FAILED due to output mismatch." << std::endl; | |||
| return -1; | |||
| } | |||
| std::cout << "\nTest PASSED." << std::endl; | |||
| return 0; | |||
| } | |||
| int main() | |||
| { | |||
| // 运行预热测试,检查基本IO功能 | |||
| if (warmup_gpu_pipecache() != 0) | |||
| { | |||
| return -1; | |||
| } | |||
| // 运行性能对比测试 | |||
| return test_gpu_pipecache_performance(); | |||
| } | |||