// Copyright 2018 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "gpu.h" #if NCNN_VULKAN #include #include #include #include #include "glslang/SPIRV/GlslangToSpv.h" #if NCNN_SYSTEM_GLSLANG #include "glslang/Public/ShaderLang.h" #else #include "glslang/glslang/Public/ShaderLang.h" #endif #include "layer/vulkan/shader/vulkan_activation.comp.hex.h" #include "command.h" #include "layer.h" #include "layer_type.h" #include "mat.h" #include "pipelinecache.h" // There is known issue that vkDestroyDebugUtilsMessengerEXT crash on exit when vulkan validation layer enabled // upstream fix https://github.com/KhronosGroup/Vulkan-Loader/pull/539 #define ENABLE_VALIDATION_LAYER 0 namespace ncnn { // global static Mutex g_instance_lock; class __ncnn_vulkan_instance_holder { public: __ncnn_vulkan_instance_holder() { instance = 0; instance_api_version = 0; created = 0; glslang_initialized = false; #if NCNN_VULKAN_LOADER libvulkan = 0; #if defined __ANDROID__ hvkdi = 0; #endif #endif // NCNN_VULKAN_LOADER #if ENABLE_VALIDATION_LAYER callback = 0; #endif } ~__ncnn_vulkan_instance_holder() { destroy_gpu_instance(); } operator VkInstance() { return instance; } VkInstance instance; uint32_t instance_api_version; int created; bool glslang_initialized; #if ENABLE_VALIDATION_LAYER VkDebugUtilsMessengerEXT callback; #endif }; static __ncnn_vulkan_instance_holder g_instance; static int g_gpu_count = 0; static int g_default_gpu_index = -1; // NOTE 32 is large enough i think ... #define NCNN_MAX_GPU_COUNT 32 static GpuInfo* g_gpu_infos[NCNN_MAX_GPU_COUNT] = {0}; // default vulkan device static Mutex g_default_vkdev_lock; static VulkanDevice* g_default_vkdev[NCNN_MAX_GPU_COUNT] = {0}; struct layer_shader_registry_entry { const char* comp_data; int comp_data_size; }; #include "layer_shader_spv_data.h" static const layer_shader_registry_entry layer_shader_registry[] = { #include "layer_shader_registry.h" }; static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry); // vulkan core PFN_vkAllocateCommandBuffers vkAllocateCommandBuffers = 0; PFN_vkAllocateDescriptorSets vkAllocateDescriptorSets = 0; PFN_vkAllocateMemory vkAllocateMemory = 0; PFN_vkBeginCommandBuffer vkBeginCommandBuffer = 0; PFN_vkBindBufferMemory vkBindBufferMemory = 0; PFN_vkBindImageMemory vkBindImageMemory = 0; PFN_vkCmdBeginQuery vkCmdBeginQuery = 0; PFN_vkCmdBindDescriptorSets vkCmdBindDescriptorSets = 0; PFN_vkCmdBindIndexBuffer vkCmdBindIndexBuffer = 0; PFN_vkCmdBindPipeline vkCmdBindPipeline = 0; PFN_vkCmdCopyBuffer vkCmdCopyBuffer = 0; PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage = 0; PFN_vkCmdCopyImage vkCmdCopyImage = 0; PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer = 0; PFN_vkCmdCopyQueryPoolResults vkCmdCopyQueryPoolResults = 0; PFN_vkCmdDispatch vkCmdDispatch = 0; PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect = 0; PFN_vkCmdEndQuery vkCmdEndQuery = 0; PFN_vkCmdExecuteCommands vkCmdExecuteCommands = 0; PFN_vkCmdFillBuffer vkCmdFillBuffer = 0; PFN_vkCmdPipelineBarrier vkCmdPipelineBarrier = 0; PFN_vkCmdPushConstants vkCmdPushConstants = 0; PFN_vkCmdResetQueryPool vkCmdResetQueryPool = 0; PFN_vkCmdResolveImage vkCmdResolveImage = 0; PFN_vkCmdUpdateBuffer vkCmdUpdateBuffer = 0; PFN_vkCmdWriteTimestamp vkCmdWriteTimestamp = 0; PFN_vkCreateBuffer vkCreateBuffer = 0; PFN_vkCreateBufferView vkCreateBufferView = 0; PFN_vkCreateCommandPool vkCreateCommandPool = 0; PFN_vkCreateComputePipelines vkCreateComputePipelines = 0; PFN_vkCreateDescriptorPool vkCreateDescriptorPool = 0; PFN_vkCreateDescriptorSetLayout vkCreateDescriptorSetLayout = 0; PFN_vkCreateDevice vkCreateDevice = 0; PFN_vkCreateFence vkCreateFence = 0; PFN_vkCreateImage vkCreateImage = 0; PFN_vkCreateImageView vkCreateImageView = 0; PFN_vkCreatePipelineCache vkCreatePipelineCache = 0; PFN_vkCreatePipelineLayout vkCreatePipelineLayout = 0; PFN_vkCreateQueryPool vkCreateQueryPool = 0; PFN_vkCreateSampler vkCreateSampler = 0; PFN_vkCreateSemaphore vkCreateSemaphore = 0; PFN_vkCreateShaderModule vkCreateShaderModule = 0; PFN_vkDestroyBuffer vkDestroyBuffer = 0; PFN_vkDestroyBufferView vkDestroyBufferView = 0; PFN_vkDestroyCommandPool vkDestroyCommandPool = 0; PFN_vkDestroyDescriptorPool vkDestroyDescriptorPool = 0; PFN_vkDestroyDescriptorSetLayout vkDestroyDescriptorSetLayout = 0; PFN_vkDestroyDevice vkDestroyDevice = 0; PFN_vkDestroyFence vkDestroyFence = 0; PFN_vkDestroyImage vkDestroyImage = 0; PFN_vkDestroyImageView vkDestroyImageView = 0; PFN_vkDestroyInstance vkDestroyInstance = 0; PFN_vkDestroyPipeline vkDestroyPipeline = 0; PFN_vkDestroyPipelineCache vkDestroyPipelineCache = 0; PFN_vkDestroyPipelineLayout vkDestroyPipelineLayout = 0; PFN_vkDestroyQueryPool vkDestroyQueryPool = 0; PFN_vkDestroySampler vkDestroySampler = 0; PFN_vkDestroySemaphore vkDestroySemaphore = 0; PFN_vkDestroyShaderModule vkDestroyShaderModule = 0; PFN_vkDeviceWaitIdle vkDeviceWaitIdle = 0; PFN_vkEndCommandBuffer vkEndCommandBuffer = 0; PFN_vkEnumerateDeviceExtensionProperties vkEnumerateDeviceExtensionProperties = 0; PFN_vkEnumerateDeviceLayerProperties vkEnumerateDeviceLayerProperties = 0; PFN_vkEnumeratePhysicalDevices vkEnumeratePhysicalDevices = 0; PFN_vkFlushMappedMemoryRanges vkFlushMappedMemoryRanges = 0; PFN_vkFreeCommandBuffers vkFreeCommandBuffers = 0; PFN_vkFreeDescriptorSets vkFreeDescriptorSets = 0; PFN_vkFreeMemory vkFreeMemory = 0; PFN_vkGetBufferMemoryRequirements vkGetBufferMemoryRequirements = 0; PFN_vkGetDeviceMemoryCommitment vkGetDeviceMemoryCommitment = 0; PFN_vkGetDeviceProcAddr vkGetDeviceProcAddr = 0; PFN_vkGetDeviceQueue vkGetDeviceQueue = 0; PFN_vkGetFenceStatus vkGetFenceStatus = 0; PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements = 0; PFN_vkGetImageSubresourceLayout vkGetImageSubresourceLayout = 0; PFN_vkGetPhysicalDeviceFeatures vkGetPhysicalDeviceFeatures = 0; PFN_vkGetPhysicalDeviceFormatProperties vkGetPhysicalDeviceFormatProperties = 0; PFN_vkGetPhysicalDeviceImageFormatProperties vkGetPhysicalDeviceImageFormatProperties = 0; PFN_vkGetPhysicalDeviceMemoryProperties vkGetPhysicalDeviceMemoryProperties = 0; PFN_vkGetPhysicalDeviceProperties vkGetPhysicalDeviceProperties = 0; PFN_vkGetPhysicalDeviceQueueFamilyProperties vkGetPhysicalDeviceQueueFamilyProperties = 0; PFN_vkGetPipelineCacheData vkGetPipelineCacheData = 0; PFN_vkGetQueryPoolResults vkGetQueryPoolResults = 0; PFN_vkInvalidateMappedMemoryRanges vkInvalidateMappedMemoryRanges = 0; PFN_vkMapMemory vkMapMemory = 0; PFN_vkMergePipelineCaches vkMergePipelineCaches = 0; PFN_vkQueueSubmit vkQueueSubmit = 0; PFN_vkQueueWaitIdle vkQueueWaitIdle = 0; PFN_vkResetCommandBuffer vkResetCommandBuffer = 0; PFN_vkResetCommandPool vkResetCommandPool = 0; PFN_vkResetDescriptorPool vkResetDescriptorPool = 0; PFN_vkResetFences vkResetFences = 0; PFN_vkUnmapMemory vkUnmapMemory = 0; PFN_vkUpdateDescriptorSets vkUpdateDescriptorSets = 0; PFN_vkWaitForFences vkWaitForFences = 0; int support_VK_KHR_external_memory_capabilities = 0; int support_VK_KHR_get_physical_device_properties2 = 0; int support_VK_KHR_get_surface_capabilities2 = 0; int support_VK_KHR_portability_enumeration = 0; int support_VK_KHR_surface = 0; int support_VK_EXT_debug_utils = 0; int support_VK_EXT_validation_features = 0; int support_VK_EXT_validation_flags = 0; #if __ANDROID_API__ >= 26 int support_VK_KHR_android_surface = 0; #endif // __ANDROID_API__ >= 26 // VK_KHR_cooperative_matrix PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = 0; // VK_KHR_external_memory_capabilities PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR = 0; // VK_KHR_get_physical_device_properties2 PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = 0; PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR = 0; PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR = 0; PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR = 0; PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR = 0; PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR = 0; // VK_KHR_get_surface_capabilities2 PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR = 0; PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR = 0; // VK_KHR_surface PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR = 0; PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR = 0; PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR = 0; PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR = 0; PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR = 0; #if __ANDROID_API__ >= 26 // VK_KHR_android_surface PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR = 0; #endif // __ANDROID_API__ >= 26 // VK_NV_cooperative_matrix PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV = 0; // VK_NV_cooperative_matrix2 PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV = 0; // VK_NV_cooperative_vector PFN_vkGetPhysicalDeviceCooperativeVectorPropertiesNV vkGetPhysicalDeviceCooperativeVectorPropertiesNV = 0; class GpuInfoPrivate { public: void query_features(); void query_properties(); void query_queue_properties(); int query_extensions(); void query_extension_features(); void query_extension_properties(); public: int device_index; // physical device VkPhysicalDevice physicalDevice; // features VkPhysicalDeviceFeatures physicalDevicefeatures; // properties VkPhysicalDeviceProperties physicalDeviceProperties; // memory properties VkPhysicalDeviceMemoryProperties physicalDeviceMemoryProperties; // extension properties std::vector deviceExtensionProperties; // 0 = discrete gpu // 1 = integrated gpu // 2 = virtual gpu // 3 = cpu int type; // runtime uint32_t compute_queue_family_index; uint32_t transfer_queue_family_index; uint32_t compute_queue_count; uint32_t transfer_queue_count; // property bool unified_compute_transfer_queue; // bug is not feature bool bug_storage_buffer_no_l1; bool bug_corrupted_online_pipeline_cache; bool bug_buffer_image_load_zero; // but sometimes bug is a feature bool bug_implicit_fp16_arithmetic; // cooperative matrix bool support_cooperative_matrix_8_8_16; bool support_cooperative_matrix_16_8_8; bool support_cooperative_matrix_16_8_16; bool support_cooperative_matrix_16_16_16; // extension capability int support_VK_KHR_8bit_storage; int support_VK_KHR_16bit_storage; int support_VK_KHR_bind_memory2; int support_VK_KHR_buffer_device_address; int support_VK_KHR_create_renderpass2; int support_VK_KHR_cooperative_matrix; int support_VK_KHR_dedicated_allocation; int support_VK_KHR_descriptor_update_template; int support_VK_KHR_driver_properties; int support_VK_KHR_external_memory; int support_VK_KHR_get_memory_requirements2; int support_VK_KHR_maintenance1; int support_VK_KHR_maintenance2; int support_VK_KHR_maintenance3; int support_VK_KHR_multiview; int support_VK_KHR_portability_subset; int support_VK_KHR_push_descriptor; int support_VK_KHR_sampler_ycbcr_conversion; int support_VK_KHR_shader_bfloat16; int support_VK_KHR_shader_float16_int8; int support_VK_KHR_shader_float_controls; int support_VK_KHR_shader_float_controls2; int support_VK_KHR_shader_integer_dot_product; int support_VK_KHR_shader_non_semantic_info; int support_VK_KHR_shader_subgroup_extended_types; int support_VK_KHR_shader_subgroup_rotate; int support_VK_KHR_storage_buffer_storage_class; int support_VK_KHR_swapchain; int support_VK_KHR_vulkan_memory_model; int support_VK_KHR_zero_initialize_workgroup_memory; int support_VK_EXT_buffer_device_address; int support_VK_EXT_descriptor_indexing; int support_VK_EXT_memory_budget; int support_VK_EXT_memory_priority; int support_VK_EXT_queue_family_foreign; int support_VK_EXT_shader_atomic_float; int support_VK_EXT_shader_atomic_float2; int support_VK_EXT_shader_float8; int support_VK_EXT_subgroup_size_control; int support_VK_AMD_device_coherent_memory; #if __ANDROID_API__ >= 26 int support_VK_ANDROID_external_memory_android_hardware_buffer; #endif // __ANDROID_API__ >= 26 int support_VK_NV_cooperative_matrix; int support_VK_NV_cooperative_matrix2; int support_VK_NV_cooperative_vector; // extension features void* queryExtensionFeatures; VkPhysicalDevice8BitStorageFeaturesKHR query8BitStorageFeatures; VkPhysicalDevice16BitStorageFeaturesKHR query16BitStorageFeatures; VkPhysicalDeviceFloat16Int8FeaturesKHR queryFloat16Int8Features; VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures; VkPhysicalDeviceCooperativeMatrixFeaturesKHR queryCooperativeMatrixFeatures; VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeaturesNV; VkPhysicalDeviceCooperativeMatrix2FeaturesNV queryCooperativeMatrix2FeaturesNV; VkPhysicalDeviceShaderBfloat16FeaturesKHR queryShaderBfloat16Features; VkPhysicalDeviceShaderFloat8FeaturesEXT queryShaderFloat8Features; VkPhysicalDeviceShaderFloatControls2FeaturesKHR queryShaderFloatControls2Features; VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR queryShaderIntegerDotProductFeatures; VkPhysicalDeviceSubgroupSizeControlFeaturesEXT querySubgroupSizeControlFeatures; VkPhysicalDeviceShaderSubgroupRotateFeaturesKHR queryShaderSubgroupRotateFeatures; VkPhysicalDeviceShaderAtomicFloatFeaturesEXT queryShaderAtomicFloatFeatures; VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT queryShaderAtomicFloat2Features; VkPhysicalDeviceCooperativeVectorFeaturesNV queryCooperativeVectorFeaturesNV; VkPhysicalDeviceVulkanMemoryModelFeaturesKHR queryVulkanMemoryModelFeatures; // extension properties void* queryExtensionProperties; VkPhysicalDeviceFloatControlsPropertiesKHR queryFloatControlsProperties; VkPhysicalDeviceShaderIntegerDotProductProperties queryShaderIntegerDotProductProperties; VkPhysicalDeviceSubgroupProperties querySubgroupProperties; VkPhysicalDeviceDriverPropertiesKHR queryDriverProperties; VkPhysicalDeviceSubgroupSizeControlPropertiesEXT querySubgroupSizeControlProperties; VkPhysicalDeviceCooperativeMatrix2PropertiesNV queryCooperativeMatrix2PropertiesNV; VkPhysicalDeviceCooperativeVectorPropertiesNV queryCooperativeVectorPropertiesNV; // extension sub properties std::vector queryCooperativeMatrixSubProperties; std::vector queryCooperativeMatrixSubPropertiesNV; std::vector queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV; std::vector queryCooperativeVectorSubPropertiesNV; }; void GpuInfoPrivate::query_features() { vkGetPhysicalDeviceFeatures(physicalDevice, &physicalDevicefeatures); } void GpuInfoPrivate::query_properties() { vkGetPhysicalDeviceProperties(physicalDevice, &physicalDeviceProperties); // NCNN_LOGE("[%u] apiVersion = %u.%u.%u", i, VK_VERSION_MAJOR(physicalDeviceProperties.apiVersion), // VK_VERSION_MINOR(physicalDeviceProperties.apiVersion), VK_VERSION_PATCH(physicalDeviceProperties.apiVersion)); // NCNN_LOGE("[%u] driverVersion = %u.%u.%u", i, VK_VERSION_MAJOR(physicalDeviceProperties.driverVersion), // VK_VERSION_MINOR(physicalDeviceProperties.driverVersion), VK_VERSION_PATCH(physicalDeviceProperties.driverVersion)); // NCNN_LOGE("[%u] vendorID = %x", i, physicalDeviceProperties.vendorID); // NCNN_LOGE("[%u] deviceID = %x", i, physicalDeviceProperties.deviceID); // NCNN_LOGE("[%u] deviceType = %x", i, physicalDeviceProperties.deviceType); // NCNN_LOGE("[%u] deviceName = %s", i, physicalDeviceProperties.deviceName); // NCNN_LOGE("[%u] pipelineCacheUUID = %u", i, physicalDeviceProperties.pipelineCacheUUID); // device type { type = -1; if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) type = 0; if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) type = 1; if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU) type = 2; if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_CPU) type = 3; } // mali // t760 = 0x13b5 0x7500001 / 0x7501000 // t860 = 0x13b5 0x8602000 // t880 = 0x13b5 0x8800020 // g31 = 0x13b5 0x70930000 // g51 = 0x13b5 0x70901010 // g52 = 0x13b5 0x74021000 / 0x72120000 // g71 = 0x13b5 0x60a00002 // g72 = 0x13b5 0x62210001 // g76 = 0x13b5 0x72110000 // g77 = 0x13b5 0x90800011 // adreno // 506 = 0x5143 0x5000600 // 510 = 0x5143 0x5010000 // 512 = 0x5143 0x5010200 // 530 = 0x5143 0x5030004 // 540 = 0x5143 0x5040001 // 616 = 0x5143 0x6010600 // 630 = 0x5143 0x6030001 // 640 = 0x5143 0x6040001 // 650 = 0x5143 0x6050002 bug_storage_buffer_no_l1 = false; bug_corrupted_online_pipeline_cache = false; bug_implicit_fp16_arithmetic = false; bug_buffer_image_load_zero = false; if (physicalDeviceProperties.vendorID == 0x5143 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 66)) { // qcom adreno with old buggy driver cannot share created pipeline properly bug_corrupted_online_pipeline_cache = true; } if (physicalDeviceProperties.vendorID == 0x5143 && !(physicalDeviceProperties.deviceID == 0x6040001 || physicalDeviceProperties.deviceID == 0x6050002)) { // NOTE but qcom855/qcom855plus/qcom865 are known exceptions // qcom adreno storage buffer without L1 cache bug_storage_buffer_no_l1 = true; } if (physicalDeviceProperties.vendorID == 0x5143 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 1, 87)) { // HACK buffer2image before image-read dependency does not work properly // even promised with full image memory barrier on old adreno driver // TODO figure out a proper workaround without hurt speed too much // TODO only for old drivers bug_buffer_image_load_zero = true; } if (physicalDeviceProperties.vendorID == 0x13b5 && (physicalDeviceProperties.deviceID == 0x7500001 || physicalDeviceProperties.deviceID == 0x7501000 || physicalDeviceProperties.deviceID == 0x8602000 || physicalDeviceProperties.deviceID == 0x8800020 || physicalDeviceProperties.deviceID == 0x70930000 || physicalDeviceProperties.deviceID == 0x70901010 || physicalDeviceProperties.deviceID == 0x72120000 || physicalDeviceProperties.deviceID == 0x74021000 || physicalDeviceProperties.deviceID == 0x60a00002 || physicalDeviceProperties.deviceID == 0x62210001)) { // NOTE rk3288/rk3399/t880/g31/g51/g52/g71/g72 // however, g76/g77 has explicit fp16 arithmetic // arm mali driver accept spirv with fp16 arithmetic bug_implicit_fp16_arithmetic = true; } if (physicalDeviceProperties.vendorID == 0x5143 && (physicalDeviceProperties.deviceID == 0x6030001 || physicalDeviceProperties.deviceID == 0x6040001 || physicalDeviceProperties.deviceID == 0x6050002)) { // TODO enable devices other than qcom845/qcom855/qcom855plus/qcom865 // qcom adreno driver accept spirv with fp16 arithmetic bug_implicit_fp16_arithmetic = true; } } static uint32_t find_device_compute_queue(const std::vector& queueFamilyProperties) { // first try, compute only queue for (uint32_t i = 0; i < queueFamilyProperties.size(); i++) { const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i]; if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT) && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)) { return i; } } // second try, any queue with compute and graphics for (uint32_t i = 0; i < queueFamilyProperties.size(); i++) { const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i]; if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT) && (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)) { return i; } } // third try, any queue with compute for (uint32_t i = 0; i < queueFamilyProperties.size(); i++) { const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i]; if (queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT) { return i; } } // NCNN_LOGE("no compute queue"); return -1; } static uint32_t find_device_transfer_queue(const std::vector& queueFamilyProperties) { // first try, transfer only queue for (uint32_t i = 0; i < queueFamilyProperties.size(); i++) { const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i]; if ((queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT) && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT) && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT)) { return i; } } // second try, any queue with transfer for (uint32_t i = 0; i < queueFamilyProperties.size(); i++) { const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i]; if (queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT) { return i; } } // third try, use compute queue uint32_t compute_queue_index = find_device_compute_queue(queueFamilyProperties); if (compute_queue_index != (uint32_t)-1) { return compute_queue_index; } // NCNN_LOGE("no transfer queue"); return -1; } void GpuInfoPrivate::query_queue_properties() { // find compute queue uint32_t queueFamilyPropertiesCount; vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, 0); std::vector queueFamilyProperties(queueFamilyPropertiesCount); vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties.data()); compute_queue_family_index = find_device_compute_queue(queueFamilyProperties); transfer_queue_family_index = find_device_transfer_queue(queueFamilyProperties); compute_queue_count = queueFamilyProperties[compute_queue_family_index].queueCount; transfer_queue_count = queueFamilyProperties[transfer_queue_family_index].queueCount; unified_compute_transfer_queue = compute_queue_family_index == transfer_queue_family_index; } int GpuInfoPrivate::query_extensions() { // get device extension uint32_t deviceExtensionPropertyCount = 0; VkResult ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, NULL); if (ret != VK_SUCCESS) { NCNN_LOGE("vkEnumerateDeviceExtensionProperties failed %d", ret); return -1; } deviceExtensionProperties.resize(deviceExtensionPropertyCount); ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, deviceExtensionProperties.data()); if (ret != VK_SUCCESS) { NCNN_LOGE("vkEnumerateDeviceExtensionProperties failed %d", ret); return -1; } // extension capability support_VK_KHR_8bit_storage = 0; support_VK_KHR_16bit_storage = 0; support_VK_KHR_bind_memory2 = 0; support_VK_KHR_buffer_device_address = 0; support_VK_KHR_create_renderpass2 = 0; support_VK_KHR_cooperative_matrix = 0; support_VK_KHR_dedicated_allocation = 0; support_VK_KHR_descriptor_update_template = 0; support_VK_KHR_driver_properties = 0; support_VK_KHR_external_memory = 0; support_VK_KHR_get_memory_requirements2 = 0; support_VK_KHR_maintenance1 = 0; support_VK_KHR_maintenance2 = 0; support_VK_KHR_maintenance3 = 0; support_VK_KHR_multiview = 0; support_VK_KHR_portability_subset = 0; support_VK_KHR_push_descriptor = 0; support_VK_KHR_sampler_ycbcr_conversion = 0; support_VK_KHR_shader_bfloat16 = 0; support_VK_KHR_shader_float16_int8 = 0; support_VK_KHR_shader_float_controls = 0; support_VK_KHR_shader_float_controls2 = 0; support_VK_KHR_shader_integer_dot_product = 0; support_VK_KHR_shader_non_semantic_info = 0; support_VK_KHR_shader_subgroup_extended_types = 0; support_VK_KHR_shader_subgroup_rotate = 0; support_VK_KHR_storage_buffer_storage_class = 0; support_VK_KHR_swapchain = 0; support_VK_KHR_vulkan_memory_model = 0; support_VK_KHR_zero_initialize_workgroup_memory = 0; support_VK_EXT_buffer_device_address = 0; support_VK_EXT_descriptor_indexing = 0; support_VK_EXT_memory_budget = 0; support_VK_EXT_memory_priority = 0; support_VK_EXT_queue_family_foreign = 0; support_VK_EXT_shader_atomic_float = 0; support_VK_EXT_shader_atomic_float2 = 0; support_VK_EXT_shader_float8 = 0; support_VK_EXT_subgroup_size_control = 0; support_VK_AMD_device_coherent_memory = 0; #if __ANDROID_API__ >= 26 support_VK_ANDROID_external_memory_android_hardware_buffer = 0; #endif // __ANDROID_API__ >= 26 support_VK_NV_cooperative_matrix = 0; support_VK_NV_cooperative_matrix2 = 0; support_VK_NV_cooperative_vector = 0; for (uint32_t j = 0; j < deviceExtensionPropertyCount; j++) { const VkExtensionProperties& exp = deviceExtensionProperties[j]; // NCNN_LOGE("device extension %s = %u", exp.extensionName, exp.specVersion); if (strcmp(exp.extensionName, "VK_KHR_8bit_storage") == 0) support_VK_KHR_8bit_storage = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_16bit_storage") == 0) support_VK_KHR_16bit_storage = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_bind_memory2") == 0) support_VK_KHR_bind_memory2 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_buffer_device_address") == 0) support_VK_KHR_buffer_device_address = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_create_renderpass2") == 0) support_VK_KHR_create_renderpass2 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_cooperative_matrix") == 0) support_VK_KHR_cooperative_matrix = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_dedicated_allocation") == 0) support_VK_KHR_dedicated_allocation = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_descriptor_update_template") == 0) support_VK_KHR_descriptor_update_template = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_driver_properties") == 0) support_VK_KHR_driver_properties = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_external_memory") == 0) support_VK_KHR_external_memory = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_get_memory_requirements2") == 0) support_VK_KHR_get_memory_requirements2 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_maintenance1") == 0) support_VK_KHR_maintenance1 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_maintenance2") == 0) support_VK_KHR_maintenance2 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_maintenance3") == 0) support_VK_KHR_maintenance3 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_multiview") == 0) support_VK_KHR_multiview = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_portability_subset") == 0) support_VK_KHR_portability_subset = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_push_descriptor") == 0) support_VK_KHR_push_descriptor = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_sampler_ycbcr_conversion") == 0) support_VK_KHR_sampler_ycbcr_conversion = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_shader_bfloat16") == 0) support_VK_KHR_shader_bfloat16 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_shader_float16_int8") == 0) support_VK_KHR_shader_float16_int8 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_shader_float_controls") == 0) support_VK_KHR_shader_float_controls = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_shader_float_controls2") == 0) support_VK_KHR_shader_float_controls2 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_shader_integer_dot_product") == 0) support_VK_KHR_shader_integer_dot_product = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_shader_non_semantic_info") == 0) support_VK_KHR_shader_non_semantic_info = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_shader_subgroup_extended_types") == 0) support_VK_KHR_shader_subgroup_extended_types = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_shader_subgroup_rotate") == 0) support_VK_KHR_shader_subgroup_rotate = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_storage_buffer_storage_class") == 0) support_VK_KHR_storage_buffer_storage_class = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_swapchain") == 0) support_VK_KHR_swapchain = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_vulkan_memory_model") == 0) support_VK_KHR_vulkan_memory_model = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_zero_initialize_workgroup_memory") == 0) support_VK_KHR_zero_initialize_workgroup_memory = exp.specVersion; else if (strcmp(exp.extensionName, "VK_EXT_buffer_device_address") == 0) support_VK_EXT_buffer_device_address = exp.specVersion; else if (strcmp(exp.extensionName, "VK_EXT_descriptor_indexing") == 0) support_VK_EXT_descriptor_indexing = exp.specVersion; else if (strcmp(exp.extensionName, "VK_EXT_memory_budget") == 0) support_VK_EXT_memory_budget = exp.specVersion; else if (strcmp(exp.extensionName, "VK_EXT_memory_priority") == 0) support_VK_EXT_memory_priority = exp.specVersion; else if (strcmp(exp.extensionName, "VK_EXT_queue_family_foreign") == 0) support_VK_EXT_queue_family_foreign = exp.specVersion; else if (strcmp(exp.extensionName, "VK_EXT_shader_atomic_float") == 0) support_VK_EXT_shader_atomic_float = exp.specVersion; else if (strcmp(exp.extensionName, "VK_EXT_shader_atomic_float2") == 0) support_VK_EXT_shader_atomic_float2 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_EXT_shader_float8") == 0) support_VK_EXT_shader_float8 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_EXT_subgroup_size_control") == 0) support_VK_EXT_subgroup_size_control = exp.specVersion; else if (strcmp(exp.extensionName, "VK_AMD_device_coherent_memory") == 0) support_VK_AMD_device_coherent_memory = exp.specVersion; #if __ANDROID_API__ >= 26 else if (strcmp(exp.extensionName, "VK_ANDROID_external_memory_android_hardware_buffer") == 0) support_VK_ANDROID_external_memory_android_hardware_buffer = exp.specVersion; #endif // __ANDROID_API__ >= 26 else if (strcmp(exp.extensionName, "VK_NV_cooperative_matrix") == 0) support_VK_NV_cooperative_matrix = exp.specVersion; else if (strcmp(exp.extensionName, "VK_NV_cooperative_matrix2") == 0) support_VK_NV_cooperative_matrix2 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_NV_cooperative_vector") == 0) support_VK_NV_cooperative_vector = exp.specVersion; } if (support_VK_KHR_buffer_device_address) { // we prefer khr extension support_VK_EXT_buffer_device_address = 0; } if (support_VK_KHR_cooperative_matrix) { // we prefer khr extension support_VK_NV_cooperative_matrix = 0; } return 0; } void GpuInfoPrivate::query_extension_features() { queryExtensionFeatures = 0; // query int8 storage memset(&query8BitStorageFeatures, 0, sizeof(query8BitStorageFeatures)); query8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR; query8BitStorageFeatures.pNext = 0; if (support_VK_KHR_8bit_storage) { query8BitStorageFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &query8BitStorageFeatures; } // query fp16/int16 storage memset(&query16BitStorageFeatures, 0, sizeof(query16BitStorageFeatures)); query16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR; query16BitStorageFeatures.pNext = 0; if (support_VK_KHR_16bit_storage) { query16BitStorageFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &query16BitStorageFeatures; } // query fp16/int8 arithmetic memset(&queryFloat16Int8Features, 0, sizeof(queryFloat16Int8Features)); queryFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR; queryFloat16Int8Features.pNext = 0; if (support_VK_KHR_shader_float16_int8) { queryFloat16Int8Features.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryFloat16Int8Features; } // query ycbcr_conversion memset(&querySamplerYcbcrConversionFeatures, 0, sizeof(querySamplerYcbcrConversionFeatures)); querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR; querySamplerYcbcrConversionFeatures.pNext = 0; if (support_VK_KHR_sampler_ycbcr_conversion) { querySamplerYcbcrConversionFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &querySamplerYcbcrConversionFeatures; } // query cooperative_matrix memset(&queryCooperativeMatrixFeatures, 0, sizeof(queryCooperativeMatrixFeatures)); queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; queryCooperativeMatrixFeatures.pNext = 0; if (support_VK_KHR_cooperative_matrix) { queryCooperativeMatrixFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryCooperativeMatrixFeatures; } // query nv cooperative matrix memset(&queryCooperativeMatrixFeaturesNV, 0, sizeof(queryCooperativeMatrixFeaturesNV)); queryCooperativeMatrixFeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV; queryCooperativeMatrixFeaturesNV.pNext = 0; if (support_VK_NV_cooperative_matrix) { queryCooperativeMatrixFeaturesNV.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryCooperativeMatrixFeaturesNV; } // query nv cooperative matrix2 memset(&queryCooperativeMatrix2FeaturesNV, 0, sizeof(queryCooperativeMatrix2FeaturesNV)); queryCooperativeMatrix2FeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV; queryCooperativeMatrix2FeaturesNV.pNext = 0; if (support_VK_NV_cooperative_matrix2) { queryCooperativeMatrix2FeaturesNV.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryCooperativeMatrix2FeaturesNV; } // query bfloat16 memset(&queryShaderBfloat16Features, 0, sizeof(queryShaderBfloat16Features)); queryShaderBfloat16Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR; queryShaderBfloat16Features.pNext = 0; if (support_VK_KHR_shader_bfloat16) { queryShaderBfloat16Features.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryShaderBfloat16Features; } // query float8 memset(&queryShaderFloat8Features, 0, sizeof(queryShaderFloat8Features)); queryShaderFloat8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT8_FEATURES_EXT; queryShaderFloat8Features.pNext = 0; if (support_VK_EXT_shader_float8) { queryShaderFloat8Features.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryShaderFloat8Features; } // query float controls 2 memset(&queryShaderFloatControls2Features, 0, sizeof(queryShaderFloatControls2Features)); queryShaderFloatControls2Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT_CONTROLS_2_FEATURES_KHR; queryShaderFloatControls2Features.pNext = 0; if (support_VK_KHR_shader_float_controls2) { queryShaderFloatControls2Features.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryShaderFloatControls2Features; } // query integer dot product memset(&queryShaderIntegerDotProductFeatures, 0, sizeof(queryShaderIntegerDotProductFeatures)); queryShaderIntegerDotProductFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR; queryShaderIntegerDotProductFeatures.pNext = 0; if (support_VK_KHR_shader_integer_dot_product) { queryShaderIntegerDotProductFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryShaderIntegerDotProductFeatures; } // query subgroup size control memset(&querySubgroupSizeControlFeatures, 0, sizeof(querySubgroupSizeControlFeatures)); querySubgroupSizeControlFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT; querySubgroupSizeControlFeatures.pNext = 0; if (support_VK_EXT_subgroup_size_control >= 2) { querySubgroupSizeControlFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &querySubgroupSizeControlFeatures; } // query subgroup rotate memset(&queryShaderSubgroupRotateFeatures, 0, sizeof(queryShaderSubgroupRotateFeatures)); queryShaderSubgroupRotateFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_ROTATE_FEATURES_KHR; queryShaderSubgroupRotateFeatures.pNext = 0; if (support_VK_KHR_shader_subgroup_rotate) { queryShaderSubgroupRotateFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryShaderSubgroupRotateFeatures; } // query atomic float memset(&queryShaderAtomicFloatFeatures, 0, sizeof(queryShaderAtomicFloatFeatures)); queryShaderAtomicFloatFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_FEATURES_EXT; queryShaderAtomicFloatFeatures.pNext = 0; if (support_VK_EXT_shader_atomic_float) { queryShaderAtomicFloatFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryShaderAtomicFloatFeatures; } // query atomic float2 memset(&queryShaderAtomicFloat2Features, 0, sizeof(queryShaderAtomicFloat2Features)); queryShaderAtomicFloat2Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_2_FEATURES_EXT; queryShaderAtomicFloat2Features.pNext = 0; if (support_VK_EXT_shader_atomic_float2) { queryShaderAtomicFloat2Features.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryShaderAtomicFloat2Features; } // query vulkan memory model memset(&queryVulkanMemoryModelFeatures, 0, sizeof(queryVulkanMemoryModelFeatures)); queryVulkanMemoryModelFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES_KHR; queryVulkanMemoryModelFeatures.pNext = 0; if (support_VK_KHR_vulkan_memory_model) { queryVulkanMemoryModelFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryVulkanMemoryModelFeatures; } // query nv cooperative vector memset(&queryCooperativeVectorFeaturesNV, 0, sizeof(queryCooperativeVectorFeaturesNV)); queryCooperativeVectorFeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_VECTOR_FEATURES_NV; queryCooperativeVectorFeaturesNV.pNext = 0; if (support_VK_NV_cooperative_vector) { queryCooperativeVectorFeaturesNV.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryCooperativeVectorFeaturesNV; } if (support_VK_KHR_get_physical_device_properties2) { VkPhysicalDeviceFeatures2KHR queryFeatures; queryFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR; queryFeatures.pNext = queryExtensionFeatures; vkGetPhysicalDeviceFeatures2KHR(physicalDevice, &queryFeatures); } // apply known blacklist if (physicalDeviceProperties.vendorID == 0x13b5 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 82)) { // the 16bit_storage implementation of arm mali driver is buggy :[ query16BitStorageFeatures.storageBuffer16BitAccess = VK_FALSE; } if (physicalDeviceProperties.vendorID == 0x10002 && physicalDeviceProperties.deviceID == 0x70006214 && physicalDeviceProperties.apiVersion == VK_MAKE_VERSION(1, 1, 82)) { // the 16bit_storage implementation of vivante gc1700 driver is buggy :[ query16BitStorageFeatures.storageBuffer16BitAccess = VK_FALSE; } if (bug_implicit_fp16_arithmetic) { // force capability on as long as the driver accept spirv with fp16 arithmetic :D queryFloat16Int8Features.shaderFloat16 = VK_TRUE; } if (physicalDeviceProperties.vendorID == 0x5143 && !query16BitStorageFeatures.storageBuffer16BitAccess) { // fp16 arithmetic yields wrong result on old adreno drivers :( queryFloat16Int8Features.shaderFloat16 = VK_FALSE; } } void GpuInfoPrivate::query_extension_properties() { queryExtensionProperties = 0; // query float controls memset(&queryFloatControlsProperties, 0, sizeof(queryFloatControlsProperties)); queryFloatControlsProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES; queryFloatControlsProperties.pNext = 0; if (support_VK_KHR_shader_float_controls) { queryFloatControlsProperties.pNext = queryExtensionProperties; queryExtensionProperties = &queryFloatControlsProperties; } // query integer dot product memset(&queryShaderIntegerDotProductProperties, 0, sizeof(queryShaderIntegerDotProductProperties)); queryShaderIntegerDotProductProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR; queryShaderIntegerDotProductProperties.pNext = 0; if (support_VK_KHR_driver_properties) { queryShaderIntegerDotProductProperties.pNext = queryExtensionProperties; queryExtensionProperties = &queryShaderIntegerDotProductProperties; } // query subgroup memset(&querySubgroupProperties, 0, sizeof(querySubgroupProperties)); querySubgroupProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; querySubgroupProperties.pNext = 0; if (VK_VERSION_MAJOR(g_instance.instance_api_version) >= 1 && VK_VERSION_MINOR(g_instance.instance_api_version) >= 1) { querySubgroupProperties.pNext = queryExtensionProperties; queryExtensionProperties = &querySubgroupProperties; } else { querySubgroupProperties.subgroupSize = 64; if (physicalDeviceProperties.vendorID == 0x5143) // qcom adreno prefer very large workgroup :P querySubgroupProperties.subgroupSize = 128; if (physicalDeviceProperties.vendorID == 0x13b5) // arm mali querySubgroupProperties.subgroupSize = 16; if (physicalDeviceProperties.vendorID == 0x1010) // imgtec powervr querySubgroupProperties.subgroupSize = 32; if (physicalDeviceProperties.vendorID == 0x1002) // amd querySubgroupProperties.subgroupSize = 64; if (physicalDeviceProperties.vendorID == 0x10de) // nvidia querySubgroupProperties.subgroupSize = 32; if (physicalDeviceProperties.vendorID == 0x8086) // intel querySubgroupProperties.subgroupSize = 32; } // query driver properties memset(&queryDriverProperties, 0, sizeof(queryDriverProperties)); queryDriverProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR; queryDriverProperties.pNext = 0; if (support_VK_KHR_driver_properties) { queryDriverProperties.pNext = queryExtensionProperties; queryExtensionProperties = &queryDriverProperties; } // query subgroup size control memset(&querySubgroupSizeControlProperties, 0, sizeof(querySubgroupSizeControlProperties)); querySubgroupSizeControlProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT; querySubgroupSizeControlProperties.pNext = 0; if (support_VK_EXT_subgroup_size_control) { querySubgroupSizeControlProperties.pNext = queryExtensionProperties; queryExtensionProperties = &querySubgroupSizeControlProperties; } // query nv cooperative matrix2 memset(&queryCooperativeMatrix2PropertiesNV, 0, sizeof(queryCooperativeMatrix2PropertiesNV)); queryCooperativeMatrix2PropertiesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_PROPERTIES_NV; queryCooperativeMatrix2PropertiesNV.pNext = 0; if (support_VK_NV_cooperative_matrix2) { queryCooperativeMatrix2PropertiesNV.pNext = queryExtensionProperties; queryExtensionProperties = &queryCooperativeMatrix2PropertiesNV; } // query nv cooperative vector memset(&queryCooperativeVectorPropertiesNV, 0, sizeof(queryCooperativeVectorPropertiesNV)); queryCooperativeVectorPropertiesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_VECTOR_PROPERTIES_NV; queryCooperativeVectorPropertiesNV.pNext = 0; if (support_VK_NV_cooperative_vector) { queryCooperativeVectorPropertiesNV.pNext = queryExtensionProperties; queryExtensionProperties = &queryCooperativeVectorPropertiesNV; } if (support_VK_KHR_get_physical_device_properties2) { VkPhysicalDeviceProperties2KHR queryProperties; queryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR; queryProperties.pNext = queryExtensionProperties; vkGetPhysicalDeviceProperties2KHR(physicalDevice, &queryProperties); // append subgroup rotate if (support_VK_KHR_shader_subgroup_rotate) { if (queryShaderSubgroupRotateFeatures.shaderSubgroupRotate) querySubgroupProperties.supportedOperations |= VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR; if (queryShaderSubgroupRotateFeatures.shaderSubgroupRotateClustered) querySubgroupProperties.supportedOperations |= VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR; } } if (!support_VK_EXT_subgroup_size_control) { querySubgroupSizeControlProperties.minSubgroupSize = querySubgroupProperties.subgroupSize; querySubgroupSizeControlProperties.maxSubgroupSize = querySubgroupProperties.subgroupSize; querySubgroupSizeControlProperties.maxComputeWorkgroupSubgroups = std::max(physicalDeviceProperties.limits.maxComputeWorkGroupInvocations / querySubgroupProperties.subgroupSize, 1u); } // query supported cooperative matrix types and operations queryCooperativeMatrixSubProperties.clear(); queryCooperativeMatrixSubPropertiesNV.clear(); support_cooperative_matrix_8_8_16 = false; support_cooperative_matrix_16_8_8 = false; support_cooperative_matrix_16_8_16 = false; support_cooperative_matrix_16_16_16 = false; if (support_VK_KHR_cooperative_matrix && queryCooperativeMatrixFeatures.cooperativeMatrix) { uint32_t propertyCount = 0; VkResult ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, 0); if (ret != VK_SUCCESS) { NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret); } queryCooperativeMatrixSubProperties.resize(propertyCount); for (uint32_t j = 0; j < propertyCount; j++) { memset(&queryCooperativeMatrixSubProperties[j], 0, sizeof(queryCooperativeMatrixSubProperties[j])); queryCooperativeMatrixSubProperties[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR; queryCooperativeMatrixSubProperties[j].pNext = 0; } ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, queryCooperativeMatrixSubProperties.data()); if (ret != VK_SUCCESS) { NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret); } for (uint32_t j = 0; j < propertyCount; j++) { const VkCooperativeMatrixPropertiesKHR& cmp = queryCooperativeMatrixSubProperties[j]; // NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope); if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16 && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.scope == VK_SCOPE_SUBGROUP_KHR) { support_cooperative_matrix_8_8_16 = true; } if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8 && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.scope == VK_SCOPE_SUBGROUP_KHR) { support_cooperative_matrix_16_8_8 = true; } if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16 && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.scope == VK_SCOPE_SUBGROUP_KHR) { support_cooperative_matrix_16_8_16 = true; } if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16 && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.scope == VK_SCOPE_SUBGROUP_KHR) { support_cooperative_matrix_16_16_16 = true; } } } else if (support_VK_NV_cooperative_matrix && queryCooperativeMatrixFeaturesNV.cooperativeMatrix) { uint32_t propertyCount = 0; VkResult ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, 0); if (ret != VK_SUCCESS) { NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret); } queryCooperativeMatrixSubPropertiesNV.resize(propertyCount); for (uint32_t j = 0; j < propertyCount; j++) { memset(&queryCooperativeMatrixSubPropertiesNV[j], 0, sizeof(queryCooperativeMatrixSubPropertiesNV[j])); queryCooperativeMatrixSubPropertiesNV[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV; queryCooperativeMatrixSubPropertiesNV[j].pNext = 0; } ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, queryCooperativeMatrixSubPropertiesNV.data()); if (ret != VK_SUCCESS) { NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret); } for (uint32_t j = 0; j < propertyCount; j++) { const VkCooperativeMatrixPropertiesNV& cmp = queryCooperativeMatrixSubPropertiesNV[j]; // NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope); if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16 && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.scope == VK_SCOPE_SUBGROUP_NV) { support_cooperative_matrix_8_8_16 = true; } if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8 && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.scope == VK_SCOPE_SUBGROUP_NV) { support_cooperative_matrix_16_8_8 = true; } if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16 && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.scope == VK_SCOPE_SUBGROUP_NV) { support_cooperative_matrix_16_8_16 = true; } if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16 && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.scope == VK_SCOPE_SUBGROUP_NV) { support_cooperative_matrix_16_16_16 = true; } } } // query supported cooperative matrix2 types and operations queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV.clear(); if (support_VK_NV_cooperative_matrix2 && queryCooperativeMatrix2FeaturesNV.cooperativeMatrixFlexibleDimensions) { uint32_t propertyCount = 0; VkResult ret = vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(physicalDevice, &propertyCount, 0); if (ret != VK_SUCCESS) { NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV failed %d", ret); } queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV.resize(propertyCount); for (uint32_t j = 0; j < propertyCount; j++) { memset(&queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV[j], 0, sizeof(queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV[j])); queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_FLEXIBLE_DIMENSIONS_PROPERTIES_NV; queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV[j].pNext = 0; } ret = vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(physicalDevice, &propertyCount, queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV.data()); if (ret != VK_SUCCESS) { NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV failed %d", ret); } for (uint32_t j = 0; j < propertyCount; j++) { const VkCooperativeMatrixFlexibleDimensionsPropertiesNV& cmfdp = queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV[j]; // NCNN_LOGE("cmfdp %2d %2d %2d %d %d %d %d %d %d %d", cmfdp.MGranularity, cmfdp.NGranularity, cmfdp.KGranularity, cmfdp.AType, cmfdp.BType, cmfdp.CType, cmfdp.ResultType, cmfdp.saturatingAccumulation, cmfdp.scope, cmfdp.workgroupInvocations); } } // query supported cooperative vector types and operations queryCooperativeVectorSubPropertiesNV.clear(); if (support_VK_NV_cooperative_vector && queryCooperativeVectorFeaturesNV.cooperativeVector) { uint32_t propertyCount = 0; VkResult ret = vkGetPhysicalDeviceCooperativeVectorPropertiesNV(physicalDevice, &propertyCount, 0); if (ret != VK_SUCCESS) { NCNN_LOGE("vkGetPhysicalDeviceCooperativeVectorPropertiesNV failed %d", ret); } queryCooperativeVectorSubPropertiesNV.resize(propertyCount); for (uint32_t j = 0; j < propertyCount; j++) { memset(&queryCooperativeVectorSubPropertiesNV[j], 0, sizeof(queryCooperativeVectorSubPropertiesNV[j])); queryCooperativeVectorSubPropertiesNV[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_VECTOR_PROPERTIES_NV; queryCooperativeVectorSubPropertiesNV[j].pNext = 0; } ret = vkGetPhysicalDeviceCooperativeVectorPropertiesNV(physicalDevice, &propertyCount, queryCooperativeVectorSubPropertiesNV.data()); if (ret != VK_SUCCESS) { NCNN_LOGE("vkGetPhysicalDeviceCooperativeVectorPropertiesNV failed %d", ret); } for (uint32_t j = 0; j < propertyCount; j++) { const VkCooperativeVectorPropertiesNV& cvp = queryCooperativeVectorSubPropertiesNV[j]; // NCNN_LOGE("cvp %d %d %d %d %d %d", cvp.inputType, cvp.inputInterpretation, cvp.matrixInterpretation, cvp.biasInterpretation, cvp.resultType, cvp.transpose); } } if (queryDriverProperties.driverID == VK_DRIVER_ID_MESA_TURNIP) { // turnip crash when compiling large shader with full subgroup querySubgroupSizeControlFeatures.computeFullSubgroups = VK_FALSE; } } GpuInfo::GpuInfo() : d(new GpuInfoPrivate) { } GpuInfo::~GpuInfo() { delete d; } GpuInfo::GpuInfo(const GpuInfo&) : d(0) { } GpuInfo& GpuInfo::operator=(const GpuInfo&) { return *this; } int GpuInfo::device_index() const { return d->device_index; } VkPhysicalDevice GpuInfo::physicalDevice() const { return d->physicalDevice; } VkPhysicalDevice GpuInfo::physical_device() const { return d->physicalDevice; } const VkPhysicalDeviceFeatures& GpuInfo::physicalDevicefeatures() const { return d->physicalDevicefeatures; } const VkPhysicalDeviceProperties& GpuInfo::physicalDeviceProperties() const { return d->physicalDeviceProperties; } const VkPhysicalDeviceMemoryProperties& GpuInfo::physicalDeviceMemoryProperties() const { return d->physicalDeviceMemoryProperties; } const VkPhysicalDeviceMemoryProperties& GpuInfo::physical_device_memory_properties() const { return d->physicalDeviceMemoryProperties; } const std::vector& GpuInfo::deviceExtensionProperties() const { return d->deviceExtensionProperties; } uint32_t GpuInfo::api_version() const { return d->physicalDeviceProperties.apiVersion; } uint32_t GpuInfo::driver_version() const { return d->physicalDeviceProperties.driverVersion; } uint32_t GpuInfo::vendor_id() const { return d->physicalDeviceProperties.vendorID; } uint32_t GpuInfo::device_id() const { return d->physicalDeviceProperties.deviceID; } const char* GpuInfo::device_name() const { return d->physicalDeviceProperties.deviceName; } uint8_t* GpuInfo::pipeline_cache_uuid() const { return d->physicalDeviceProperties.pipelineCacheUUID; } uint32_t GpuInfo::driver_id() const { return d->queryDriverProperties.driverID; } const char* GpuInfo::driver_name() const { return d->queryDriverProperties.driverName; } int GpuInfo::type() const { return d->type; } uint32_t GpuInfo::max_shared_memory_size() const { return d->physicalDeviceProperties.limits.maxComputeSharedMemorySize; } uint32_t GpuInfo::max_workgroup_count_x() const { return d->physicalDeviceProperties.limits.maxComputeWorkGroupCount[0]; } uint32_t GpuInfo::max_workgroup_count_y() const { return d->physicalDeviceProperties.limits.maxComputeWorkGroupCount[1]; } uint32_t GpuInfo::max_workgroup_count_z() const { return d->physicalDeviceProperties.limits.maxComputeWorkGroupCount[2]; } uint32_t GpuInfo::max_workgroup_invocations() const { return d->physicalDeviceProperties.limits.maxComputeWorkGroupInvocations; } uint32_t GpuInfo::max_workgroup_size_x() const { return d->physicalDeviceProperties.limits.maxComputeWorkGroupSize[0]; } uint32_t GpuInfo::max_workgroup_size_y() const { return d->physicalDeviceProperties.limits.maxComputeWorkGroupSize[1]; } uint32_t GpuInfo::max_workgroup_size_z() const { return d->physicalDeviceProperties.limits.maxComputeWorkGroupSize[2]; } size_t GpuInfo::memory_map_alignment() const { return d->physicalDeviceProperties.limits.minMemoryMapAlignment; } size_t GpuInfo::buffer_offset_alignment() const { return d->physicalDeviceProperties.limits.minStorageBufferOffsetAlignment; } size_t GpuInfo::non_coherent_atom_size() const { return d->physicalDeviceProperties.limits.nonCoherentAtomSize; } size_t GpuInfo::buffer_image_granularity() const { return d->physicalDeviceProperties.limits.bufferImageGranularity; } uint32_t GpuInfo::max_image_dimension_1d() const { return d->physicalDeviceProperties.limits.maxImageDimension1D; } uint32_t GpuInfo::max_image_dimension_2d() const { return d->physicalDeviceProperties.limits.maxImageDimension2D; } uint32_t GpuInfo::max_image_dimension_3d() const { return d->physicalDeviceProperties.limits.maxImageDimension3D; } float GpuInfo::timestamp_period() const { return d->physicalDeviceProperties.limits.timestampPeriod; } uint32_t GpuInfo::compute_queue_family_index() const { return d->compute_queue_family_index; } uint32_t GpuInfo::transfer_queue_family_index() const { return d->transfer_queue_family_index; } uint32_t GpuInfo::compute_queue_count() const { return d->compute_queue_count; } uint32_t GpuInfo::transfer_queue_count() const { return d->transfer_queue_count; } bool GpuInfo::unified_compute_transfer_queue() const { return d->unified_compute_transfer_queue; } uint32_t GpuInfo::subgroup_size() const { return d->querySubgroupProperties.subgroupSize; } uint32_t GpuInfo::min_subgroup_size() const { return d->querySubgroupSizeControlProperties.minSubgroupSize; } uint32_t GpuInfo::max_subgroup_size() const { return d->querySubgroupSizeControlProperties.maxSubgroupSize; } uint32_t GpuInfo::max_compute_workgroup_subgroups() const { return d->querySubgroupSizeControlProperties.maxComputeWorkgroupSubgroups; } bool GpuInfo::support_subgroup_size_control() const { return d->querySubgroupSizeControlFeatures.subgroupSizeControl; } bool GpuInfo::support_compute_full_subgroups() const { return d->querySubgroupSizeControlFeatures.computeFullSubgroups; } uint32_t GpuInfo::support_subgroup_ops() const { return d->querySubgroupProperties.supportedOperations; } bool GpuInfo::bug_storage_buffer_no_l1() const { return d->bug_storage_buffer_no_l1; } bool GpuInfo::bug_corrupted_online_pipeline_cache() const { return d->bug_corrupted_online_pipeline_cache; } bool GpuInfo::bug_buffer_image_load_zero() const { return d->bug_buffer_image_load_zero; } bool GpuInfo::bug_implicit_fp16_arithmetic() const { return d->bug_implicit_fp16_arithmetic; } bool GpuInfo::support_fp16_packed() const { return true; } bool GpuInfo::support_fp16_storage() const { return d->query16BitStorageFeatures.storageBuffer16BitAccess; } bool GpuInfo::support_fp16_uniform() const { return d->query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess; } bool GpuInfo::support_fp16_arithmetic() const { return d->queryFloat16Int8Features.shaderFloat16; } bool GpuInfo::support_int8_packed() const { return true; } bool GpuInfo::support_int8_storage() const { return d->query8BitStorageFeatures.storageBuffer8BitAccess; } bool GpuInfo::support_int8_uniform() const { return d->query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess; } bool GpuInfo::support_int8_arithmetic() const { return d->queryFloat16Int8Features.shaderInt8; } bool GpuInfo::support_fp16_image() const { return d->physicalDevicefeatures.shaderStorageImageExtendedFormats; } bool GpuInfo::support_int8_image() const { return d->physicalDevicefeatures.shaderStorageImageExtendedFormats; } bool GpuInfo::support_fp_fast_math() const { return d->queryShaderFloatControls2Features.shaderFloatControls2; } bool GpuInfo::support_ycbcr_conversion() const { return d->querySamplerYcbcrConversionFeatures.samplerYcbcrConversion; } bool GpuInfo::support_cooperative_matrix() const { return d->queryCooperativeMatrixFeatures.cooperativeMatrix || d->queryCooperativeMatrixFeaturesNV.cooperativeMatrix; } bool GpuInfo::support_cooperative_matrix_8_8_16() const { return d->support_cooperative_matrix_8_8_16; } bool GpuInfo::support_cooperative_matrix_16_8_8() const { return d->support_cooperative_matrix_16_8_8; } bool GpuInfo::support_cooperative_matrix_16_8_16() const { return d->support_cooperative_matrix_16_8_16; } bool GpuInfo::support_cooperative_matrix_16_16_16() const { return d->support_cooperative_matrix_16_16_16; } int GpuInfo::support_VK_KHR_8bit_storage() const { return d->support_VK_KHR_8bit_storage; } int GpuInfo::support_VK_KHR_16bit_storage() const { return d->support_VK_KHR_16bit_storage; } int GpuInfo::support_VK_KHR_bind_memory2() const { return d->support_VK_KHR_bind_memory2; } int GpuInfo::support_VK_KHR_buffer_device_address() const { return d->support_VK_KHR_buffer_device_address; } int GpuInfo::support_VK_KHR_create_renderpass2() const { return d->support_VK_KHR_create_renderpass2; } int GpuInfo::support_VK_KHR_cooperative_matrix() const { return d->support_VK_KHR_cooperative_matrix; } int GpuInfo::support_VK_KHR_dedicated_allocation() const { return d->support_VK_KHR_dedicated_allocation; } int GpuInfo::support_VK_KHR_descriptor_update_template() const { return d->support_VK_KHR_descriptor_update_template; } int GpuInfo::support_VK_KHR_driver_properties() const { return d->support_VK_KHR_driver_properties; } int GpuInfo::support_VK_KHR_external_memory() const { return d->support_VK_KHR_external_memory; } int GpuInfo::support_VK_KHR_get_memory_requirements2() const { return d->support_VK_KHR_get_memory_requirements2; } int GpuInfo::support_VK_KHR_maintenance1() const { return d->support_VK_KHR_maintenance1; } int GpuInfo::support_VK_KHR_maintenance2() const { return d->support_VK_KHR_maintenance2; } int GpuInfo::support_VK_KHR_maintenance3() const { return d->support_VK_KHR_maintenance3; } int GpuInfo::support_VK_KHR_multiview() const { return d->support_VK_KHR_multiview; } int GpuInfo::support_VK_KHR_portability_subset() const { return d->support_VK_KHR_portability_subset; } int GpuInfo::support_VK_KHR_push_descriptor() const { return d->support_VK_KHR_push_descriptor; } int GpuInfo::support_VK_KHR_sampler_ycbcr_conversion() const { return d->support_VK_KHR_sampler_ycbcr_conversion; } int GpuInfo::support_VK_KHR_shader_bfloat16() const { return d->support_VK_KHR_shader_bfloat16; } int GpuInfo::support_VK_KHR_shader_float16_int8() const { return d->support_VK_KHR_shader_float16_int8; } int GpuInfo::support_VK_KHR_shader_float_controls() const { return d->support_VK_KHR_shader_float_controls; } int GpuInfo::support_VK_KHR_shader_float_controls2() const { return d->support_VK_KHR_shader_float_controls2; } int GpuInfo::support_VK_KHR_shader_integer_dot_product() const { return d->support_VK_KHR_shader_integer_dot_product; } int GpuInfo::support_VK_KHR_shader_non_semantic_info() const { return d->support_VK_KHR_shader_non_semantic_info; } int GpuInfo::support_VK_KHR_shader_subgroup_extended_types() const { return d->support_VK_KHR_shader_subgroup_extended_types; } int GpuInfo::support_VK_KHR_shader_subgroup_rotate() const { return d->support_VK_KHR_shader_subgroup_rotate; } int GpuInfo::support_VK_KHR_storage_buffer_storage_class() const { return d->support_VK_KHR_storage_buffer_storage_class; } int GpuInfo::support_VK_KHR_swapchain() const { return d->support_VK_KHR_swapchain; } int GpuInfo::support_VK_KHR_vulkan_memory_model() const { return d->support_VK_KHR_vulkan_memory_model; } int GpuInfo::support_VK_KHR_zero_initialize_workgroup_memory() const { return d->support_VK_KHR_zero_initialize_workgroup_memory; } int GpuInfo::support_VK_EXT_buffer_device_address() const { return d->support_VK_EXT_buffer_device_address; } int GpuInfo::support_VK_EXT_descriptor_indexing() const { return d->support_VK_EXT_descriptor_indexing; } int GpuInfo::support_VK_EXT_memory_budget() const { return d->support_VK_EXT_memory_budget; } int GpuInfo::support_VK_EXT_memory_priority() const { return d->support_VK_EXT_memory_priority; } int GpuInfo::support_VK_EXT_queue_family_foreign() const { return d->support_VK_EXT_queue_family_foreign; } int GpuInfo::support_VK_EXT_shader_atomic_float() const { return d->support_VK_EXT_shader_atomic_float; } int GpuInfo::support_VK_EXT_shader_atomic_float2() const { return d->support_VK_EXT_shader_atomic_float2; } int GpuInfo::support_VK_EXT_shader_float8() const { return d->support_VK_EXT_shader_float8; } int GpuInfo::support_VK_EXT_subgroup_size_control() const { return d->support_VK_EXT_subgroup_size_control; } int GpuInfo::support_VK_AMD_device_coherent_memory() const { return d->support_VK_AMD_device_coherent_memory; } #if __ANDROID_API__ >= 26 int GpuInfo::support_VK_ANDROID_external_memory_android_hardware_buffer() const { return d->support_VK_ANDROID_external_memory_android_hardware_buffer; } #endif // __ANDROID_API__ >= 26 int GpuInfo::support_VK_NV_cooperative_matrix() const { return d->support_VK_NV_cooperative_matrix; } int GpuInfo::support_VK_NV_cooperative_matrix2() const { return d->support_VK_NV_cooperative_matrix2; } int GpuInfo::support_VK_NV_cooperative_vector() const { return d->support_VK_NV_cooperative_vector; } const void* GpuInfo::queryExtensionFeatures() const { return d->queryExtensionFeatures; } const VkPhysicalDevice8BitStorageFeaturesKHR& GpuInfo::query8BitStorageFeatures() const { return d->query8BitStorageFeatures; } const VkPhysicalDevice16BitStorageFeaturesKHR& GpuInfo::query16BitStorageFeatures() const { return d->query16BitStorageFeatures; } const VkPhysicalDeviceFloat16Int8FeaturesKHR& GpuInfo::queryFloat16Int8Features() const { return d->queryFloat16Int8Features; } const VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR& GpuInfo::querySamplerYcbcrConversionFeatures() const { return d->querySamplerYcbcrConversionFeatures; } const VkPhysicalDeviceCooperativeMatrixFeaturesKHR& GpuInfo::queryCooperativeMatrixFeatures() const { return d->queryCooperativeMatrixFeatures; } const VkPhysicalDeviceCooperativeMatrixFeaturesNV& GpuInfo::queryCooperativeMatrixFeaturesNV() const { return d->queryCooperativeMatrixFeaturesNV; } const VkPhysicalDeviceCooperativeMatrix2FeaturesNV& GpuInfo::queryCooperativeMatrix2FeaturesNV() const { return d->queryCooperativeMatrix2FeaturesNV; } const VkPhysicalDeviceCooperativeVectorFeaturesNV& GpuInfo::queryCooperativeVectorFeaturesNV() const { return d->queryCooperativeVectorFeaturesNV; } const VkPhysicalDeviceSubgroupSizeControlFeaturesEXT& GpuInfo::querySubgroupSizeControlFeatures() const { return d->querySubgroupSizeControlFeatures; } const VkPhysicalDeviceShaderBfloat16FeaturesKHR& GpuInfo::queryShaderBfloat16Features() const { return d->queryShaderBfloat16Features; } const VkPhysicalDeviceShaderFloat8FeaturesEXT& GpuInfo::queryShaderFloat8Features() const { return d->queryShaderFloat8Features; } const VkPhysicalDeviceShaderFloatControls2FeaturesKHR& GpuInfo::queryShaderFloatControls2Features() const { return d->queryShaderFloatControls2Features; } const VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR& GpuInfo::queryShaderIntegerDotProductFeatures() const { return d->queryShaderIntegerDotProductFeatures; } const VkPhysicalDeviceShaderSubgroupRotateFeaturesKHR& GpuInfo::queryShaderSubgroupRotateFeatures() const { return d->queryShaderSubgroupRotateFeatures; } const VkPhysicalDeviceShaderAtomicFloatFeaturesEXT& GpuInfo::queryShaderAtomicFloatFeatures() const { return d->queryShaderAtomicFloatFeatures; } const VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT& GpuInfo::queryShaderAtomicFloat2Features() const { return d->queryShaderAtomicFloat2Features; } const VkPhysicalDeviceVulkanMemoryModelFeaturesKHR& GpuInfo::queryVulkanMemoryModelFeatures() const { return d->queryVulkanMemoryModelFeatures; } const void* GpuInfo::queryExtensionProperties() const { return d->queryExtensionProperties; } const VkPhysicalDeviceCooperativeMatrix2PropertiesNV& GpuInfo::queryCooperativeMatrix2PropertiesNV() const { return d->queryCooperativeMatrix2PropertiesNV; } const VkPhysicalDeviceCooperativeVectorPropertiesNV& GpuInfo::queryCooperativeVectorPropertiesNV() const { return d->queryCooperativeVectorPropertiesNV; } const VkPhysicalDeviceDriverPropertiesKHR& GpuInfo::queryDriverProperties() const { return d->queryDriverProperties; } const VkPhysicalDeviceFloatControlsPropertiesKHR& GpuInfo::queryFloatControlsProperties() const { return d->queryFloatControlsProperties; } const VkPhysicalDeviceShaderIntegerDotProductProperties& GpuInfo::queryShaderIntegerDotProductProperties() const { return d->queryShaderIntegerDotProductProperties; } const VkPhysicalDeviceSubgroupProperties& GpuInfo::querySubgroupProperties() const { return d->querySubgroupProperties; } const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& GpuInfo::querySubgroupSizeControlProperties() const { return d->querySubgroupSizeControlProperties; } const std::vector& GpuInfo::queryCooperativeMatrixSubProperties() const { return d->queryCooperativeMatrixSubProperties; } const std::vector& GpuInfo::queryCooperativeMatrixSubPropertiesNV() const { return d->queryCooperativeMatrixSubPropertiesNV; } const std::vector& GpuInfo::queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV() const { return d->queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV; } const std::vector& GpuInfo::queryCooperativeVectorSubPropertiesNV() const { return d->queryCooperativeVectorSubPropertiesNV; } void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponentTypeKHR type, VkComponentTypeKHR acctype, VkScopeKHR scope, int& coopmat_M, int& coopmat_N, int& coopmat_K) const { coopmat_M = 0; coopmat_N = 0; coopmat_K = 0; // collect mnk candidates std::vector mnk_properties; if (d->support_VK_KHR_cooperative_matrix && d->queryCooperativeMatrixFeatures.cooperativeMatrix) { for (size_t i = 0; i < d->queryCooperativeMatrixSubProperties.size(); i++) { const VkCooperativeMatrixPropertiesKHR& cmp = d->queryCooperativeMatrixSubProperties[i]; if (cmp.AType == type && cmp.BType == type && cmp.CType == acctype && cmp.ResultType == acctype && cmp.scope == scope) { mnk_properties.push_back(cmp); } } } else if (d->support_VK_NV_cooperative_matrix && d->queryCooperativeMatrixFeaturesNV.cooperativeMatrix) { for (size_t i = 0; i < d->queryCooperativeMatrixSubPropertiesNV.size(); i++) { const VkCooperativeMatrixPropertiesNV& cmp = d->queryCooperativeMatrixSubPropertiesNV[i]; if (cmp.AType == (VkComponentTypeNV)type && cmp.BType == (VkComponentTypeNV)type && cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype && cmp.scope == (VkScopeNV)scope) { VkCooperativeMatrixPropertiesKHR cmp_khr; cmp_khr.MSize = cmp.MSize; cmp_khr.NSize = cmp.NSize; cmp_khr.KSize = cmp.KSize; mnk_properties.push_back(cmp_khr); } } } if (mnk_properties.empty() && (acctype == VK_COMPONENT_TYPE_FLOAT16_KHR || acctype == VK_COMPONENT_TYPE_BFLOAT16_KHR)) { // try acctype fp32 return get_optimal_cooperative_matrix_mnk(M, N, K, type, VK_COMPONENT_TYPE_FLOAT32_KHR, scope, coopmat_M, coopmat_N, coopmat_K); } if (mnk_properties.empty()) return; // find the optimal, prefer the first mnk tuple with same cost double min_cost = DBL_MAX; for (size_t i = 0; i < mnk_properties.size(); i++) { const VkCooperativeMatrixPropertiesKHR& cmp = mnk_properties[i]; const int M_pad = (M + cmp.MSize - 1) / cmp.MSize * cmp.MSize; const int N_pad = (N + cmp.NSize - 1) / cmp.NSize * cmp.NSize; const int K_pad = (K + cmp.KSize - 1) / cmp.KSize * cmp.KSize; double cost = M_pad * N_pad * K_pad - M * N * K; if (cost < min_cost) { min_cost = cost; coopmat_M = cmp.MSize; coopmat_N = cmp.NSize; coopmat_K = cmp.KSize; } } } static int init_instance_core() { vkAllocateCommandBuffers = (PFN_vkAllocateCommandBuffers)vkGetInstanceProcAddr(g_instance, "vkAllocateCommandBuffers"); vkAllocateDescriptorSets = (PFN_vkAllocateDescriptorSets)vkGetInstanceProcAddr(g_instance, "vkAllocateDescriptorSets"); vkAllocateMemory = (PFN_vkAllocateMemory)vkGetInstanceProcAddr(g_instance, "vkAllocateMemory"); vkBeginCommandBuffer = (PFN_vkBeginCommandBuffer)vkGetInstanceProcAddr(g_instance, "vkBeginCommandBuffer"); vkBindBufferMemory = (PFN_vkBindBufferMemory)vkGetInstanceProcAddr(g_instance, "vkBindBufferMemory"); vkBindImageMemory = (PFN_vkBindImageMemory)vkGetInstanceProcAddr(g_instance, "vkBindImageMemory"); vkCmdBeginQuery = (PFN_vkCmdBeginQuery)vkGetInstanceProcAddr(g_instance, "vkCmdBeginQuery"); vkCmdBindDescriptorSets = (PFN_vkCmdBindDescriptorSets)vkGetInstanceProcAddr(g_instance, "vkCmdBindDescriptorSets"); vkCmdBindIndexBuffer = (PFN_vkCmdBindIndexBuffer)vkGetInstanceProcAddr(g_instance, "vkCmdBindIndexBuffer"); vkCmdBindPipeline = (PFN_vkCmdBindPipeline)vkGetInstanceProcAddr(g_instance, "vkCmdBindPipeline"); vkCmdCopyBuffer = (PFN_vkCmdCopyBuffer)vkGetInstanceProcAddr(g_instance, "vkCmdCopyBuffer"); vkCmdCopyBufferToImage = (PFN_vkCmdCopyBufferToImage)vkGetInstanceProcAddr(g_instance, "vkCmdCopyBufferToImage"); vkCmdCopyImage = (PFN_vkCmdCopyImage)vkGetInstanceProcAddr(g_instance, "vkCmdCopyImage"); vkCmdCopyImageToBuffer = (PFN_vkCmdCopyImageToBuffer)vkGetInstanceProcAddr(g_instance, "vkCmdCopyImageToBuffer"); vkCmdCopyQueryPoolResults = (PFN_vkCmdCopyQueryPoolResults)vkGetInstanceProcAddr(g_instance, "vkCmdCopyQueryPoolResults"); vkCmdDispatch = (PFN_vkCmdDispatch)vkGetInstanceProcAddr(g_instance, "vkCmdDispatch"); vkCmdDispatchIndirect = (PFN_vkCmdDispatchIndirect)vkGetInstanceProcAddr(g_instance, "vkCmdDispatchIndirect"); vkCmdEndQuery = (PFN_vkCmdEndQuery)vkGetInstanceProcAddr(g_instance, "vkCmdEndQuery"); vkCmdExecuteCommands = (PFN_vkCmdExecuteCommands)vkGetInstanceProcAddr(g_instance, "vkCmdExecuteCommands"); vkCmdFillBuffer = (PFN_vkCmdFillBuffer)vkGetInstanceProcAddr(g_instance, "vkCmdFillBuffer"); vkCmdPipelineBarrier = (PFN_vkCmdPipelineBarrier)vkGetInstanceProcAddr(g_instance, "vkCmdPipelineBarrier"); vkCmdPushConstants = (PFN_vkCmdPushConstants)vkGetInstanceProcAddr(g_instance, "vkCmdPushConstants"); vkCmdResetQueryPool = (PFN_vkCmdResetQueryPool)vkGetInstanceProcAddr(g_instance, "vkCmdResetQueryPool"); vkCmdResolveImage = (PFN_vkCmdResolveImage)vkGetInstanceProcAddr(g_instance, "vkCmdResolveImage"); vkCmdUpdateBuffer = (PFN_vkCmdUpdateBuffer)vkGetInstanceProcAddr(g_instance, "vkCmdUpdateBuffer"); vkCmdWriteTimestamp = (PFN_vkCmdWriteTimestamp)vkGetInstanceProcAddr(g_instance, "vkCmdWriteTimestamp"); vkCreateBuffer = (PFN_vkCreateBuffer)vkGetInstanceProcAddr(g_instance, "vkCreateBuffer"); vkCreateBufferView = (PFN_vkCreateBufferView)vkGetInstanceProcAddr(g_instance, "vkCreateBufferView"); vkCreateCommandPool = (PFN_vkCreateCommandPool)vkGetInstanceProcAddr(g_instance, "vkCreateCommandPool"); vkCreateComputePipelines = (PFN_vkCreateComputePipelines)vkGetInstanceProcAddr(g_instance, "vkCreateComputePipelines"); vkCreateDescriptorPool = (PFN_vkCreateDescriptorPool)vkGetInstanceProcAddr(g_instance, "vkCreateDescriptorPool"); vkCreateDescriptorSetLayout = (PFN_vkCreateDescriptorSetLayout)vkGetInstanceProcAddr(g_instance, "vkCreateDescriptorSetLayout"); vkCreateDevice = (PFN_vkCreateDevice)vkGetInstanceProcAddr(g_instance, "vkCreateDevice"); vkCreateFence = (PFN_vkCreateFence)vkGetInstanceProcAddr(g_instance, "vkCreateFence"); vkCreateImage = (PFN_vkCreateImage)vkGetInstanceProcAddr(g_instance, "vkCreateImage"); vkCreateImageView = (PFN_vkCreateImageView)vkGetInstanceProcAddr(g_instance, "vkCreateImageView"); vkCreatePipelineCache = (PFN_vkCreatePipelineCache)vkGetInstanceProcAddr(g_instance, "vkCreatePipelineCache"); vkCreatePipelineLayout = (PFN_vkCreatePipelineLayout)vkGetInstanceProcAddr(g_instance, "vkCreatePipelineLayout"); vkCreateQueryPool = (PFN_vkCreateQueryPool)vkGetInstanceProcAddr(g_instance, "vkCreateQueryPool"); vkCreateSampler = (PFN_vkCreateSampler)vkGetInstanceProcAddr(g_instance, "vkCreateSampler"); vkCreateSemaphore = (PFN_vkCreateSemaphore)vkGetInstanceProcAddr(g_instance, "vkCreateSemaphore"); vkCreateShaderModule = (PFN_vkCreateShaderModule)vkGetInstanceProcAddr(g_instance, "vkCreateShaderModule"); vkDestroyBuffer = (PFN_vkDestroyBuffer)vkGetInstanceProcAddr(g_instance, "vkDestroyBuffer"); vkDestroyBufferView = (PFN_vkDestroyBufferView)vkGetInstanceProcAddr(g_instance, "vkDestroyBufferView"); vkDestroyCommandPool = (PFN_vkDestroyCommandPool)vkGetInstanceProcAddr(g_instance, "vkDestroyCommandPool"); vkDestroyDescriptorPool = (PFN_vkDestroyDescriptorPool)vkGetInstanceProcAddr(g_instance, "vkDestroyDescriptorPool"); vkDestroyDescriptorSetLayout = (PFN_vkDestroyDescriptorSetLayout)vkGetInstanceProcAddr(g_instance, "vkDestroyDescriptorSetLayout"); vkDestroyDevice = (PFN_vkDestroyDevice)vkGetInstanceProcAddr(g_instance, "vkDestroyDevice"); vkDestroyFence = (PFN_vkDestroyFence)vkGetInstanceProcAddr(g_instance, "vkDestroyFence"); vkDestroyImage = (PFN_vkDestroyImage)vkGetInstanceProcAddr(g_instance, "vkDestroyImage"); vkDestroyImageView = (PFN_vkDestroyImageView)vkGetInstanceProcAddr(g_instance, "vkDestroyImageView"); vkDestroyInstance = (PFN_vkDestroyInstance)vkGetInstanceProcAddr(g_instance, "vkDestroyInstance"); vkDestroyPipeline = (PFN_vkDestroyPipeline)vkGetInstanceProcAddr(g_instance, "vkDestroyPipeline"); vkDestroyPipelineCache = (PFN_vkDestroyPipelineCache)vkGetInstanceProcAddr(g_instance, "vkDestroyPipelineCache"); vkDestroyPipelineLayout = (PFN_vkDestroyPipelineLayout)vkGetInstanceProcAddr(g_instance, "vkDestroyPipelineLayout"); vkDestroyQueryPool = (PFN_vkDestroyQueryPool)vkGetInstanceProcAddr(g_instance, "vkDestroyQueryPool"); vkDestroySampler = (PFN_vkDestroySampler)vkGetInstanceProcAddr(g_instance, "vkDestroySampler"); vkDestroySemaphore = (PFN_vkDestroySemaphore)vkGetInstanceProcAddr(g_instance, "vkDestroySemaphore"); vkDestroyShaderModule = (PFN_vkDestroyShaderModule)vkGetInstanceProcAddr(g_instance, "vkDestroyShaderModule"); vkDeviceWaitIdle = (PFN_vkDeviceWaitIdle)vkGetInstanceProcAddr(g_instance, "vkDeviceWaitIdle"); vkEndCommandBuffer = (PFN_vkEndCommandBuffer)vkGetInstanceProcAddr(g_instance, "vkEndCommandBuffer"); vkEnumerateDeviceExtensionProperties = (PFN_vkEnumerateDeviceExtensionProperties)vkGetInstanceProcAddr(g_instance, "vkEnumerateDeviceExtensionProperties"); vkEnumerateDeviceLayerProperties = (PFN_vkEnumerateDeviceLayerProperties)vkGetInstanceProcAddr(g_instance, "vkEnumerateDeviceLayerProperties"); vkEnumeratePhysicalDevices = (PFN_vkEnumeratePhysicalDevices)vkGetInstanceProcAddr(g_instance, "vkEnumeratePhysicalDevices"); vkFlushMappedMemoryRanges = (PFN_vkFlushMappedMemoryRanges)vkGetInstanceProcAddr(g_instance, "vkFlushMappedMemoryRanges"); vkFreeCommandBuffers = (PFN_vkFreeCommandBuffers)vkGetInstanceProcAddr(g_instance, "vkFreeCommandBuffers"); vkFreeDescriptorSets = (PFN_vkFreeDescriptorSets)vkGetInstanceProcAddr(g_instance, "vkFreeDescriptorSets"); vkFreeMemory = (PFN_vkFreeMemory)vkGetInstanceProcAddr(g_instance, "vkFreeMemory"); vkGetBufferMemoryRequirements = (PFN_vkGetBufferMemoryRequirements)vkGetInstanceProcAddr(g_instance, "vkGetBufferMemoryRequirements"); vkGetDeviceMemoryCommitment = (PFN_vkGetDeviceMemoryCommitment)vkGetInstanceProcAddr(g_instance, "vkGetDeviceMemoryCommitment"); vkGetDeviceProcAddr = (PFN_vkGetDeviceProcAddr)vkGetInstanceProcAddr(g_instance, "vkGetDeviceProcAddr"); vkGetDeviceQueue = (PFN_vkGetDeviceQueue)vkGetInstanceProcAddr(g_instance, "vkGetDeviceQueue"); vkGetFenceStatus = (PFN_vkGetFenceStatus)vkGetInstanceProcAddr(g_instance, "vkGetFenceStatus"); vkGetImageMemoryRequirements = (PFN_vkGetImageMemoryRequirements)vkGetInstanceProcAddr(g_instance, "vkGetImageMemoryRequirements"); vkGetImageSubresourceLayout = (PFN_vkGetImageSubresourceLayout)vkGetInstanceProcAddr(g_instance, "vkGetImageSubresourceLayout"); vkGetPhysicalDeviceFeatures = (PFN_vkGetPhysicalDeviceFeatures)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures"); vkGetPhysicalDeviceFormatProperties = (PFN_vkGetPhysicalDeviceFormatProperties)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFormatProperties"); vkGetPhysicalDeviceImageFormatProperties = (PFN_vkGetPhysicalDeviceImageFormatProperties)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceImageFormatProperties"); vkGetPhysicalDeviceMemoryProperties = (PFN_vkGetPhysicalDeviceMemoryProperties)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceMemoryProperties"); vkGetPhysicalDeviceProperties = (PFN_vkGetPhysicalDeviceProperties)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceProperties"); vkGetPhysicalDeviceQueueFamilyProperties = (PFN_vkGetPhysicalDeviceQueueFamilyProperties)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceQueueFamilyProperties"); vkGetPipelineCacheData = (PFN_vkGetPipelineCacheData)vkGetInstanceProcAddr(g_instance, "vkGetPipelineCacheData"); vkGetQueryPoolResults = (PFN_vkGetQueryPoolResults)vkGetInstanceProcAddr(g_instance, "vkGetQueryPoolResults"); vkInvalidateMappedMemoryRanges = (PFN_vkInvalidateMappedMemoryRanges)vkGetInstanceProcAddr(g_instance, "vkInvalidateMappedMemoryRanges"); vkMapMemory = (PFN_vkMapMemory)vkGetInstanceProcAddr(g_instance, "vkMapMemory"); vkMergePipelineCaches = (PFN_vkMergePipelineCaches)vkGetInstanceProcAddr(g_instance, "vkMergePipelineCaches"); vkQueueSubmit = (PFN_vkQueueSubmit)vkGetInstanceProcAddr(g_instance, "vkQueueSubmit"); vkQueueWaitIdle = (PFN_vkQueueWaitIdle)vkGetInstanceProcAddr(g_instance, "vkQueueWaitIdle"); vkResetCommandBuffer = (PFN_vkResetCommandBuffer)vkGetInstanceProcAddr(g_instance, "vkResetCommandBuffer"); vkResetCommandPool = (PFN_vkResetCommandPool)vkGetInstanceProcAddr(g_instance, "vkResetCommandPool"); vkResetDescriptorPool = (PFN_vkResetDescriptorPool)vkGetInstanceProcAddr(g_instance, "vkResetDescriptorPool"); vkResetFences = (PFN_vkResetFences)vkGetInstanceProcAddr(g_instance, "vkResetFences"); vkUnmapMemory = (PFN_vkUnmapMemory)vkGetInstanceProcAddr(g_instance, "vkUnmapMemory"); vkUpdateDescriptorSets = (PFN_vkUpdateDescriptorSets)vkGetInstanceProcAddr(g_instance, "vkUpdateDescriptorSets"); vkWaitForFences = (PFN_vkWaitForFences)vkGetInstanceProcAddr(g_instance, "vkWaitForFences"); return 0; } static int init_instance_extension() { if (support_VK_KHR_external_memory_capabilities) { vkGetPhysicalDeviceExternalBufferPropertiesKHR = (PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceExternalBufferPropertiesKHR"); } if (support_VK_KHR_get_physical_device_properties2) { vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR"); vkGetPhysicalDeviceProperties2KHR = (PFN_vkGetPhysicalDeviceProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceProperties2KHR"); vkGetPhysicalDeviceFormatProperties2KHR = (PFN_vkGetPhysicalDeviceFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFormatProperties2KHR"); vkGetPhysicalDeviceImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceImageFormatProperties2KHR"); vkGetPhysicalDeviceQueueFamilyProperties2KHR = (PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceQueueFamilyProperties2KHR"); vkGetPhysicalDeviceMemoryProperties2KHR = (PFN_vkGetPhysicalDeviceMemoryProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceMemoryProperties2KHR"); } if (support_VK_KHR_get_surface_capabilities2) { vkGetPhysicalDeviceSurfaceCapabilities2KHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilities2KHR"); vkGetPhysicalDeviceSurfaceFormats2KHR = (PFN_vkGetPhysicalDeviceSurfaceFormats2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormats2KHR"); } if (support_VK_KHR_surface) { vkDestroySurfaceKHR = (PFN_vkDestroySurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkDestroySurfaceKHR"); vkGetPhysicalDeviceSurfaceSupportKHR = (PFN_vkGetPhysicalDeviceSurfaceSupportKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceSupportKHR"); vkGetPhysicalDeviceSurfaceCapabilitiesKHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilitiesKHR"); vkGetPhysicalDeviceSurfaceFormatsKHR = (PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormatsKHR"); vkGetPhysicalDeviceSurfacePresentModesKHR = (PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfacePresentModesKHR"); } #if __ANDROID_API__ >= 26 if (support_VK_KHR_android_surface) { vkCreateAndroidSurfaceKHR = (PFN_vkCreateAndroidSurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkCreateAndroidSurfaceKHR"); } #endif // __ANDROID_API__ >= 26 // VK_KHR_cooperative_matrix { vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR"); } // VK_NV_cooperative_matrix { vkGetPhysicalDeviceCooperativeMatrixPropertiesNV = (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesNV"); } // VK_NV_cooperative_matrix2 { vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV = (PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV"); } // VK_NV_cooperative_vector { vkGetPhysicalDeviceCooperativeVectorPropertiesNV = (PFN_vkGetPhysicalDeviceCooperativeVectorPropertiesNV)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceCooperativeVectorPropertiesNV"); } return 0; } #if ENABLE_VALIDATION_LAYER static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback( VkDebugUtilsMessageSeverityFlagBitsEXT /*messageSeverity*/, VkDebugUtilsMessageTypeFlagsEXT /*messageType*/, const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData, void* /*pUserData*/) { NCNN_LOGE("validation layer: %s", pCallbackData->pMessage); return VK_FALSE; } static VkResult CreateDebugUtilsMessengerEXT(VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugUtilsMessengerEXT* pCallback) { PFN_vkCreateDebugUtilsMessengerEXT func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT"); if (func) return func(instance, pCreateInfo, pAllocator, pCallback); return VK_ERROR_EXTENSION_NOT_PRESENT; } static void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT callback, const VkAllocationCallbacks* pAllocator) { PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT"); if (func) func(instance, callback, pAllocator); } #endif // ENABLE_VALIDATION_LAYER static int find_default_vulkan_device_index() { // first try, discrete gpu for (int i = 0; i < g_gpu_count; i++) { if (g_gpu_infos[i]->type() == 0) return i; } // second try, integrated gpu for (int i = 0; i < g_gpu_count; i++) { if (g_gpu_infos[i]->type() == 1) return i; } // third try, any probed device if (g_gpu_count > 0) return 0; NCNN_LOGE("no vulkan device"); return -1; } int create_gpu_instance(const char* driver_path) { MutexLockGuard lock(g_instance_lock); if (g_instance.created != 0) return g_instance.instance ? 0 : -1; g_instance.created = 1; // NCNN_LOGE("create_gpu_instance"); #if NCNN_SIMPLEVK // load vulkan driver { int ret = load_vulkan_driver(driver_path); if (ret != 0) { NCNN_LOGE("load vulkan driver failed"); return -1; } } #else if (driver_path) { NCNN_LOGE("custom vulkan driver is not supported when NCNN_SIMPLEVK is off"); NCNN_LOGE("will always use the system vulkan driver"); } #endif // NCNN_SIMPLEVK VkResult ret; std::vector enabledLayers; #if ENABLE_VALIDATION_LAYER uint32_t instanceLayerPropertyCount; ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, NULL); if (ret != VK_SUCCESS) { NCNN_LOGE("vkEnumerateInstanceLayerProperties failed %d", ret); return -1; } std::vector instanceLayerProperties(instanceLayerPropertyCount); ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, instanceLayerProperties.data()); if (ret != VK_SUCCESS) { NCNN_LOGE("vkEnumerateInstanceLayerProperties failed %d", ret); return -1; } for (uint32_t i = 0; i < instanceLayerPropertyCount; i++) { const VkLayerProperties& lp = instanceLayerProperties[i]; // NCNN_LOGE("instance layer %s = %u", lp.layerName, lp.implementationVersion); if (strcmp(lp.layerName, "VK_LAYER_LUNARG_standard_validation") == 0) { enabledLayers.push_back("VK_LAYER_LUNARG_standard_validation"); } if (strcmp(lp.layerName, "VK_LAYER_LUNARG_parameter_validation") == 0) { enabledLayers.push_back("VK_LAYER_LUNARG_parameter_validation"); } if (strcmp(lp.layerName, "VK_LAYER_KHRONOS_validation") == 0) { enabledLayers.push_back("VK_LAYER_KHRONOS_validation"); } } #endif // ENABLE_VALIDATION_LAYER std::vector enabledExtensions; uint32_t instanceExtensionPropertyCount; ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, NULL); if (ret != VK_SUCCESS) { NCNN_LOGE("vkEnumerateInstanceExtensionProperties failed %d", ret); return -1; } std::vector instanceExtensionProperties(instanceExtensionPropertyCount); ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, instanceExtensionProperties.data()); if (ret != VK_SUCCESS) { NCNN_LOGE("vkEnumerateInstanceExtensionProperties failed %d", ret); return -1; } support_VK_KHR_get_physical_device_properties2 = 0; support_VK_KHR_get_surface_capabilities2 = 0; support_VK_KHR_portability_enumeration = 0; support_VK_KHR_surface = 0; support_VK_EXT_debug_utils = 0; support_VK_EXT_validation_features = 0; support_VK_EXT_validation_flags = 0; #if __ANDROID_API__ >= 26 support_VK_KHR_android_surface = 0; #endif // __ANDROID_API__ >= 26 for (uint32_t j = 0; j < instanceExtensionPropertyCount; j++) { const VkExtensionProperties& exp = instanceExtensionProperties[j]; // NCNN_LOGE("instance extension %s = %u", exp.extensionName, exp.specVersion); if (strcmp(exp.extensionName, "VK_KHR_external_memory_capabilities") == 0) support_VK_KHR_external_memory_capabilities = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_get_physical_device_properties2") == 0) support_VK_KHR_get_physical_device_properties2 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_get_surface_capabilities2") == 0) support_VK_KHR_get_surface_capabilities2 = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_portability_enumeration") == 0) support_VK_KHR_portability_enumeration = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_surface") == 0) support_VK_KHR_surface = exp.specVersion; else if (strcmp(exp.extensionName, "VK_EXT_debug_utils") == 0) support_VK_EXT_debug_utils = exp.specVersion; else if (strcmp(exp.extensionName, "VK_EXT_validation_features") == 0) support_VK_EXT_validation_features = exp.specVersion; else if (strcmp(exp.extensionName, "VK_EXT_validation_flags") == 0) support_VK_EXT_validation_flags = exp.specVersion; #if __ANDROID_API__ >= 26 else if (strcmp(exp.extensionName, "VK_KHR_android_surface") == 0) support_VK_KHR_android_surface = exp.specVersion; #endif // __ANDROID_API__ >= 26 } if (support_VK_EXT_validation_features) { // we prefer the modern one support_VK_EXT_validation_flags = 0; } if (support_VK_KHR_external_memory_capabilities) enabledExtensions.push_back("VK_KHR_external_memory_capabilities"); if (support_VK_KHR_get_physical_device_properties2) enabledExtensions.push_back("VK_KHR_get_physical_device_properties2"); if (support_VK_KHR_get_surface_capabilities2) enabledExtensions.push_back("VK_KHR_get_surface_capabilities2"); if (support_VK_KHR_portability_enumeration) enabledExtensions.push_back("VK_KHR_portability_enumeration"); if (support_VK_KHR_surface) enabledExtensions.push_back("VK_KHR_surface"); #if ENABLE_VALIDATION_LAYER if (support_VK_EXT_debug_utils) enabledExtensions.push_back("VK_EXT_debug_utils"); if (support_VK_EXT_validation_features) enabledExtensions.push_back("VK_EXT_validation_features"); if (support_VK_EXT_validation_flags) enabledExtensions.push_back("VK_EXT_validation_flags"); #endif // ENABLE_VALIDATION_LAYER #if __ANDROID_API__ >= 26 if (support_VK_KHR_android_surface) enabledExtensions.push_back("VK_KHR_android_surface"); #endif // __ANDROID_API__ >= 26 uint32_t instance_api_version = VK_MAKE_VERSION(1, 0, 0); typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion); PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion"); if (vkEnumerateInstanceVersion) { ret = vkEnumerateInstanceVersion(&instance_api_version); if (ret != VK_SUCCESS) { NCNN_LOGE("vkEnumerateInstanceVersion failed %d", ret); return -1; } } // NCNN_LOGE("instance apiVersion = %u.%u.%u", VK_VERSION_MAJOR(instance_api_version), VK_VERSION_MINOR(instance_api_version), VK_VERSION_PATCH(instance_api_version)); VkApplicationInfo applicationInfo; applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; applicationInfo.pNext = 0; applicationInfo.pApplicationName = "ncnn"; applicationInfo.applicationVersion = 0; applicationInfo.pEngineName = "ncnn"; applicationInfo.engineVersion = 20250327; applicationInfo.apiVersion = instance_api_version; void* enabledExtensionFeatures = 0; #if ENABLE_VALIDATION_LAYER std::vector enabledValidationFeature; enabledValidationFeature.push_back(VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT); enabledValidationFeature.push_back(VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT); enabledValidationFeature.push_back(VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT); enabledValidationFeature.push_back(VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT); enabledValidationFeature.push_back(VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT); VkValidationFeaturesEXT validationFeatures; validationFeatures.sType = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT; validationFeatures.pNext = 0; validationFeatures.enabledValidationFeatureCount = enabledValidationFeature.size(); validationFeatures.pEnabledValidationFeatures = enabledValidationFeature.data(); validationFeatures.disabledValidationFeatureCount = 0; validationFeatures.pDisabledValidationFeatures = 0; if (support_VK_EXT_validation_features) { validationFeatures.pNext = enabledExtensionFeatures; enabledExtensionFeatures = &validationFeatures; } VkValidationFlagsEXT validationFlags; validationFlags.sType = VK_STRUCTURE_TYPE_VALIDATION_FLAGS_EXT; validationFlags.pNext = 0; validationFlags.disabledValidationCheckCount = 0; validationFlags.pDisabledValidationChecks = 0; if (support_VK_EXT_validation_flags) { validationFlags.pNext = enabledExtensionFeatures; enabledExtensionFeatures = &validationFlags; } #endif // ENABLE_VALIDATION_LAYER VkInstanceCreateInfo instanceCreateInfo; instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; instanceCreateInfo.pNext = enabledExtensionFeatures; instanceCreateInfo.flags = 0; if (support_VK_KHR_portability_enumeration) instanceCreateInfo.flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR; instanceCreateInfo.pApplicationInfo = &applicationInfo; instanceCreateInfo.enabledLayerCount = enabledLayers.size(); instanceCreateInfo.ppEnabledLayerNames = enabledLayers.data(); instanceCreateInfo.enabledExtensionCount = enabledExtensions.size(); instanceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data(); VkInstance instance = 0; ret = vkCreateInstance(&instanceCreateInfo, 0, &instance); if (ret != VK_SUCCESS) { NCNN_LOGE("vkCreateInstance failed %d", ret); return -1; } g_instance.instance = instance; g_instance.instance_api_version = instance_api_version; init_instance_core(); #if ENABLE_VALIDATION_LAYER if (support_VK_EXT_debug_utils) { VkDebugUtilsMessengerCreateInfoEXT createInfo = {}; createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; createInfo.pfnUserCallback = debugCallback; createInfo.pUserData = 0; ret = CreateDebugUtilsMessengerEXT(g_instance, &createInfo, NULL, &g_instance.callback); if (ret != VK_SUCCESS) { NCNN_LOGE("CreateDebugUtilsMessengerEXT failed %d", ret); return -1; } } #endif // ENABLE_VALIDATION_LAYER init_instance_extension(); uint32_t physicalDeviceCount = 0; ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, 0); if (ret != VK_SUCCESS) { NCNN_LOGE("vkEnumeratePhysicalDevices failed %d", ret); return -1; } if (physicalDeviceCount > NCNN_MAX_GPU_COUNT) physicalDeviceCount = NCNN_MAX_GPU_COUNT; std::vector physicalDevices(physicalDeviceCount); ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, physicalDevices.data()); if (ret != VK_SUCCESS) { NCNN_LOGE("vkEnumeratePhysicalDevices failed %d", ret); return -1; } // find proper device and queue int gpu_info_index = 0; for (uint32_t i = 0; i < physicalDeviceCount; i++) { const VkPhysicalDevice& physicalDevice = physicalDevices[i]; delete g_gpu_infos[gpu_info_index]; g_gpu_infos[gpu_info_index] = new GpuInfo; GpuInfo& gpu_info = *g_gpu_infos[gpu_info_index]; gpu_info.d->device_index = gpu_info_index; gpu_info.d->physicalDevice = physicalDevice; gpu_info.d->query_features(); gpu_info.d->query_properties(); // device type // info // NCNN_LOGE("[%u] max_shared_memory_size = %u", i, gpu_info.max_shared_memory_size); // NCNN_LOGE("[%u] max_workgroup_count = %u %u %u", i, gpu_info.max_workgroup_count[0], gpu_info.max_workgroup_count[1], gpu_info.max_workgroup_count[2]); // NCNN_LOGE("[%u] max_workgroup_invocations = %u", i, gpu_info.max_workgroup_invocations); // NCNN_LOGE("[%u] max_workgroup_size = %u %u %u", i, gpu_info.max_workgroup_size[0], gpu_info.max_workgroup_size[1], gpu_info.max_workgroup_size[2]); // NCNN_LOGE("[%u] memory_map_alignment = %lu", i, gpu_info.memory_map_alignment); // NCNN_LOGE("[%u] buffer_offset_alignment = %lu", i, gpu_info.buffer_offset_alignment); gpu_info.d->query_queue_properties(); // cache memory properties vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.d->physicalDeviceMemoryProperties); int rqde = gpu_info.d->query_extensions(); if (rqde != 0) { return -1; } gpu_info.d->query_extension_features(); gpu_info.d->query_extension_properties(); NCNN_LOGE("[%u %s] queueC=%u[%u] queueT=%u[%u]", i, gpu_info.device_name(), gpu_info.compute_queue_family_index(), gpu_info.compute_queue_count(), gpu_info.transfer_queue_family_index(), gpu_info.transfer_queue_count()); NCNN_LOGE("[%u %s] fp16-p/s/u/a=%d/%d/%d/%d int8-p/s/u/a=%d/%d/%d/%d", i, gpu_info.device_name(), gpu_info.support_fp16_packed(), gpu_info.support_fp16_storage(), gpu_info.support_fp16_uniform(), gpu_info.support_fp16_arithmetic(), gpu_info.support_int8_packed(), gpu_info.support_int8_storage(), gpu_info.support_int8_uniform(), gpu_info.support_int8_arithmetic()); NCNN_LOGE("[%u %s] subgroup=%u(%u~%u) ops=%d/%d/%d/%d/%d/%d/%d/%d/%d/%d", i, gpu_info.device_name(), gpu_info.subgroup_size(), gpu_info.min_subgroup_size(), gpu_info.max_subgroup_size(), (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_BASIC_BIT) != 0, (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_VOTE_BIT) != 0, (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) != 0, (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_BALLOT_BIT) != 0, (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_SHUFFLE_BIT) != 0, (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) != 0, (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_CLUSTERED_BIT) != 0, (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_QUAD_BIT) != 0, (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR) != 0, (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR) != 0); // collect matrix mnk std::vector fp16_matrix_properties; std::vector int8_matrix_properties; std::vector bf16_matrix_properties; std::vector fp8_matrix_properties; if (gpu_info.support_VK_KHR_cooperative_matrix()) { const std::vector& properties = gpu_info.queryCooperativeMatrixSubProperties(); for (uint32_t j = 0; j < properties.size(); j++) { const VkCooperativeMatrixPropertiesKHR& cmp = properties[j]; if (cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) { bool mnk_hit = false; for (size_t k = 0; k < fp16_matrix_properties.size(); k++) { const VkCooperativeMatrixPropertiesKHR& cmp0 = fp16_matrix_properties[k]; if (cmp.MSize == cmp0.MSize && cmp.NSize == cmp0.NSize && cmp.KSize == cmp0.KSize) { mnk_hit = true; break; } } if (!mnk_hit) fp16_matrix_properties.push_back(cmp); } if ((cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.AType == VK_COMPONENT_TYPE_SINT8_PACKED_NV) && (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV)) { bool mnk_hit = false; for (size_t k = 0; k < int8_matrix_properties.size(); k++) { const VkCooperativeMatrixPropertiesKHR& cmp0 = int8_matrix_properties[k]; if (cmp.MSize == cmp0.MSize && cmp.NSize == cmp0.NSize && cmp.KSize == cmp0.KSize) { mnk_hit = true; break; } } if (!mnk_hit) int8_matrix_properties.push_back(cmp); } if (cmp.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR) { bool mnk_hit = false; for (size_t k = 0; k < bf16_matrix_properties.size(); k++) { const VkCooperativeMatrixPropertiesKHR& cmp0 = bf16_matrix_properties[k]; if (cmp.MSize == cmp0.MSize && cmp.NSize == cmp0.NSize && cmp.KSize == cmp0.KSize) { mnk_hit = true; break; } } if (!mnk_hit) bf16_matrix_properties.push_back(cmp); } if ((cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV) && (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)) { bool mnk_hit = false; for (size_t k = 0; k < fp8_matrix_properties.size(); k++) { const VkCooperativeMatrixPropertiesKHR& cmp0 = fp8_matrix_properties[k]; if (cmp.MSize == cmp0.MSize && cmp.NSize == cmp0.NSize && cmp.KSize == cmp0.KSize) { mnk_hit = true; break; } } if (!mnk_hit) fp8_matrix_properties.push_back(cmp); } } } else if (gpu_info.support_VK_NV_cooperative_matrix()) { const std::vector& properties = gpu_info.queryCooperativeMatrixSubPropertiesNV(); for (uint32_t j = 0; j < properties.size(); j++) { const VkCooperativeMatrixPropertiesNV& cmp = properties[j]; if (cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV) { bool mnk_hit = false; for (size_t k = 0; k < fp16_matrix_properties.size(); k++) { const VkCooperativeMatrixPropertiesKHR& cmp0 = fp16_matrix_properties[k]; if (cmp.MSize == cmp0.MSize && cmp.NSize == cmp0.NSize && cmp.KSize == cmp0.KSize) { mnk_hit = true; break; } } if (!mnk_hit) { VkCooperativeMatrixPropertiesKHR cmp_khr; cmp_khr.MSize = cmp.MSize; cmp_khr.NSize = cmp.NSize; cmp_khr.KSize = cmp.KSize; fp16_matrix_properties.push_back(cmp_khr); } } if (cmp.AType == VK_COMPONENT_TYPE_SINT8_NV && cmp.BType == VK_COMPONENT_TYPE_SINT8_NV) { bool mnk_hit = false; for (size_t k = 0; k < int8_matrix_properties.size(); k++) { const VkCooperativeMatrixPropertiesKHR& cmp0 = int8_matrix_properties[k]; if (cmp.MSize == cmp0.MSize && cmp.NSize == cmp0.NSize && cmp.KSize == cmp0.KSize) { mnk_hit = true; break; } } if (!mnk_hit) { VkCooperativeMatrixPropertiesKHR cmp_khr; cmp_khr.MSize = cmp.MSize; cmp_khr.NSize = cmp.NSize; cmp_khr.KSize = cmp.KSize; int8_matrix_properties.push_back(cmp_khr); } } } } std::string fp16_matrix_info_str; std::string int8_matrix_info_str; std::string bf16_matrix_info_str; std::string fp8_matrix_info_str; { for (uint32_t j = 0; j < fp16_matrix_properties.size(); j++) { const VkCooperativeMatrixPropertiesKHR& cmp = fp16_matrix_properties[j]; char tmp[64]; sprintf(tmp, j > 0 ? "/%ux%ux%u" : "%ux%ux%u", cmp.MSize, cmp.NSize, cmp.KSize); fp16_matrix_info_str += tmp; } for (uint32_t j = 0; j < int8_matrix_properties.size(); j++) { const VkCooperativeMatrixPropertiesKHR& cmp = int8_matrix_properties[j]; char tmp[64]; sprintf(tmp, j > 0 ? "/%ux%ux%u" : "%ux%ux%u", cmp.MSize, cmp.NSize, cmp.KSize); int8_matrix_info_str += tmp; } for (uint32_t j = 0; j < bf16_matrix_properties.size(); j++) { const VkCooperativeMatrixPropertiesKHR& cmp = bf16_matrix_properties[j]; char tmp[64]; sprintf(tmp, j > 0 ? "/%ux%ux%u" : "%ux%ux%u", cmp.MSize, cmp.NSize, cmp.KSize); bf16_matrix_info_str += tmp; } for (uint32_t j = 0; j < fp8_matrix_properties.size(); j++) { const VkCooperativeMatrixPropertiesKHR& cmp = fp8_matrix_properties[j]; char tmp[64]; sprintf(tmp, j > 0 ? "/%ux%ux%u" : "%ux%ux%u", cmp.MSize, cmp.NSize, cmp.KSize); fp8_matrix_info_str += tmp; } if (fp16_matrix_info_str.empty()) fp16_matrix_info_str = "0"; if (int8_matrix_info_str.empty()) int8_matrix_info_str = "0"; if (bf16_matrix_info_str.empty()) bf16_matrix_info_str = "0"; if (fp8_matrix_info_str.empty()) fp8_matrix_info_str = "0"; } NCNN_LOGE("[%u %s] fp16-cm=%s int8-cm=%s bf16-cm=%s fp8-cm=%s", i, gpu_info.device_name(), fp16_matrix_info_str.c_str(), int8_matrix_info_str.c_str(), bf16_matrix_info_str.c_str(), fp8_matrix_info_str.c_str()); gpu_info_index++; } g_gpu_count = gpu_info_index; // the default gpu device g_default_gpu_index = find_default_vulkan_device_index(); g_instance.glslang_initialized = glslang::InitializeProcess(); // the global __ncnn_vulkan_instance_holder destructor will call destroy_gpu_instance() on exit // but it seems to be too late for nvidia driver :( // driver's internal data structure has been destroyed when called, causing segfault // atexit() seems to be helpful for calling it earlier --- nihui static int destroy_gpu_instance_atexit_registered = 0; if (!destroy_gpu_instance_atexit_registered) { atexit(destroy_gpu_instance); destroy_gpu_instance_atexit_registered = 1; } return 0; } VkInstance get_gpu_instance() { return (VkInstance)g_instance; } void destroy_gpu_instance() { MutexLockGuard lock(g_instance_lock); if (g_instance.created == 0) return; for (int i = 0; i < NCNN_MAX_GPU_COUNT; i++) { VulkanDevice* vulkan_device = g_default_vkdev[i]; if (vulkan_device) { VkDevice vkdev = g_default_vkdev[i]->vkdevice(); if (vkdev) { vkDeviceWaitIdle(vkdev); } } } // NCNN_LOGE("destroy_gpu_instance"); if (g_instance.glslang_initialized) { glslang::FinalizeProcess(); g_instance.glslang_initialized = false; } for (int i = 0; i < NCNN_MAX_GPU_COUNT; i++) { delete g_default_vkdev[i]; g_default_vkdev[i] = 0; delete g_gpu_infos[i]; g_gpu_infos[i] = 0; } #if ENABLE_VALIDATION_LAYER if (support_VK_EXT_debug_utils && g_instance.callback) { DestroyDebugUtilsMessengerEXT(g_instance, g_instance.callback, NULL); g_instance.callback = 0; } #endif // ENABLE_VALIDATION_LAYER if (vkDestroyInstance) { vkDestroyInstance(g_instance, 0); vkDestroyInstance = 0; } g_instance.instance = 0; #if NCNN_SIMPLEVK unload_vulkan_driver(); #endif g_instance.created = 0; } static void try_create_gpu_instance() { { MutexLockGuard lock(g_instance_lock); if (g_instance.created != 0) return; } create_gpu_instance(); } int get_gpu_count() { try_create_gpu_instance(); return g_gpu_count; } int get_default_gpu_index() { try_create_gpu_instance(); return g_default_gpu_index; } const GpuInfo& get_gpu_info(int device_index) { try_create_gpu_instance(); return *g_gpu_infos[device_index]; } class VkDummyAllocator : public VkBlobAllocator { public: // NOTE 16k is large enough I think ... VkDummyAllocator(const VulkanDevice* _vkdev) : VkBlobAllocator(_vkdev, 16 * 1024) { } }; class VkDummyCompute : public VkCompute { public: VkDummyCompute(const VulkanDevice* _vkdev) : VkCompute(_vkdev) { } void record_dummy(const VkMat& buffer) { barrier_readwrite(buffer); } void record_dummy(const VkImageMat& image) { barrier_readwrite(image); } void record_dummy_readonly(const VkImageMat& image) { barrier_readonly(image); } }; class VulkanDevicePrivate { public: VulkanDevicePrivate(VulkanDevice* _vkdev); VulkanDevice* const vkdev; // dummy buffer and image int create_dummy_buffer_image(); void destroy_dummy_buffer_image(); // utility operator const ncnn::Layer* get_utility_operator(int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const; void destroy_utility_operator(); VkDevice device; // hardware queue mutable std::vector compute_queues; mutable std::vector transfer_queues; mutable int free_compute_queue_count; mutable int free_transfer_queue_count; mutable Mutex compute_queue_lock; mutable Mutex transfer_queue_lock; mutable ConditionVariable compute_queue_condition; mutable ConditionVariable transfer_queue_condition; // default blob allocator for each queue mutable std::vector blob_allocators; mutable Mutex blob_allocator_lock; // default staging allocator for each queue mutable std::vector staging_allocators; mutable Mutex staging_allocator_lock; // nearest sampler for texelfetch VkSampler texelfetch_sampler; // dummy buffer and image VkAllocator* dummy_allocator; VkMat dummy_buffer; VkImageMat dummy_image; VkImageMat dummy_image_readonly; // device-wide pipeline cache PipelineCache* pipeline_cache; // utility operator // from fp32 | fp16 // to fp32 | fp16 // to pack1 | pack4 | pack8 mutable ncnn::Layer* uop_packing[2][2][3]; // from int8 // to int8 // to pack1 | pack4 | pack8 mutable ncnn::Layer* uop_packing_int8[3]; mutable Mutex uop_lock; // device is valid and sucessfully initialized bool valid; }; VulkanDevicePrivate::VulkanDevicePrivate(VulkanDevice* _vkdev) : vkdev(_vkdev) { device = 0; texelfetch_sampler = 0; dummy_allocator = 0; pipeline_cache = 0; valid = false; memset(uop_packing, 0, sizeof(uop_packing)); memset(uop_packing_int8, 0, sizeof(uop_packing_int8)); } int VulkanDevicePrivate::create_dummy_buffer_image() { dummy_allocator = new VkDummyAllocator(vkdev); dummy_buffer.create(1, 4u, dummy_allocator); dummy_image.create(1, 4u, dummy_allocator); #if __APPLE__ if (vkdev->info.type() == 0) dummy_image_readonly.create(1, 4u, dummy_allocator); #else dummy_image_readonly.create(1, 4u, dummy_allocator); #endif VkDummyCompute cmd(vkdev); cmd.record_dummy(dummy_buffer); cmd.record_dummy(dummy_image); #if __APPLE__ if (vkdev->info.type() == 0) cmd.record_dummy_readonly(dummy_image_readonly); #else cmd.record_dummy_readonly(dummy_image_readonly); #endif return cmd.submit_and_wait(); } void VulkanDevicePrivate::destroy_dummy_buffer_image() { dummy_buffer.release(); dummy_image.release(); #if __APPLE__ if (vkdev->info.type() == 0) dummy_image_readonly.release(); #else dummy_image_readonly.release(); #endif if (dummy_allocator) { delete dummy_allocator; dummy_allocator = 0; } } const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const { bool use_fp16 = (cast_type_from_index == 1 || cast_type_to_index == 1); bool use_int8 = (cast_type_from_index == 3 || cast_type_to_index == 3); MutexLockGuard lock(uop_lock); const ncnn::Layer* cached_uop = 0; if (use_int8) { cached_uop = uop_packing_int8[packing_type_to_index]; } else { cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index]; } if (cached_uop) return cached_uop; // create uop Option opt; opt.use_fp16_packed = use_fp16; // fp16p is always supported opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage(); opt.use_int8_packed = use_int8; // int8p is always supported opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage(); // fp16/int8 arithmetic are not necessary for packing // and may conflict with storage options opt.use_fp16_arithmetic = false; opt.use_int8_arithmetic = false; // enable pack8 for pack8to1/pack8to4 opt.use_shader_pack8 = true; // do not enable spirv-1.3 from cooperative matrix opt.use_cooperative_matrix = false; opt.use_vulkan_compute = true; // cache uop pipeline as device member explicitly opt.pipeline_cache = 0; opt.vulkan_device_index = vkdev->info.device_index(); ncnn::Layer* uop = ncnn::create_layer_vulkan(LayerType::Packing); uop->vkdev = vkdev; ncnn::ParamDict pd; pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int8 pd.set(3, cast_type_to_index + 1); uop->load_param(pd); uop->create_pipeline(opt); if (use_int8) { uop_packing_int8[packing_type_to_index] = uop; } else { uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index] = uop; } return uop; } void VulkanDevicePrivate::destroy_utility_operator() { Option opt; opt.use_vulkan_compute = true; opt.use_fp16_arithmetic = false; opt.use_int8_arithmetic = false; opt.use_cooperative_matrix = false; opt.pipeline_cache = 0; opt.vulkan_device_index = vkdev->info.device_index(); // from fp32 | fp16 for (int j0 = 0; j0 < 2; j0++) { // to fp32 | fp16 for (int j1 = 0; j1 < 2; j1++) { bool use_fp16 = (j0 == 1 || j1 == 1); opt.use_fp16_packed = use_fp16; opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage(); opt.use_int8_packed = false; opt.use_int8_storage = false; // to pack1 | pack4 | pack8 for (int k = 0; k < 3; k++) { // enable pack8 for pack8to1/pack8to4 opt.use_shader_pack8 = true; ncnn::Layer* uop = uop_packing[j0][j1][k]; if (!uop) continue; uop->destroy_pipeline(opt); delete uop; uop_packing[j0][j1][k] = 0; } } } // int8 { bool use_int8 = true; opt.use_fp16_packed = false; opt.use_fp16_storage = false; opt.use_int8_packed = use_int8; opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage(); // to pack1 | pack4 | pack8 for (int k = 0; k < 3; k++) { // enable pack8 for pack8to1/pack8to4 opt.use_shader_pack8 = true; ncnn::Layer* uop = uop_packing_int8[k]; if (!uop) continue; uop->destroy_pipeline(opt); delete uop; uop_packing_int8[k] = 0; } } } VulkanDevice::VulkanDevice(int device_index) : info(get_gpu_info(device_index)), d(new VulkanDevicePrivate(this)) { try_create_gpu_instance(); std::vector enabledExtensions; if (info.support_VK_KHR_8bit_storage()) enabledExtensions.push_back("VK_KHR_8bit_storage"); if (info.support_VK_KHR_16bit_storage()) enabledExtensions.push_back("VK_KHR_16bit_storage"); if (info.support_VK_KHR_bind_memory2()) enabledExtensions.push_back("VK_KHR_bind_memory2"); if (info.support_VK_KHR_buffer_device_address()) enabledExtensions.push_back("VK_KHR_buffer_device_address"); if (info.support_VK_KHR_create_renderpass2()) enabledExtensions.push_back("VK_KHR_create_renderpass2"); if (info.support_VK_KHR_cooperative_matrix()) enabledExtensions.push_back("VK_KHR_cooperative_matrix"); if (info.support_VK_KHR_dedicated_allocation()) enabledExtensions.push_back("VK_KHR_dedicated_allocation"); if (info.support_VK_KHR_descriptor_update_template()) enabledExtensions.push_back("VK_KHR_descriptor_update_template"); if (info.support_VK_KHR_driver_properties()) enabledExtensions.push_back("VK_KHR_driver_properties"); if (info.support_VK_KHR_external_memory()) enabledExtensions.push_back("VK_KHR_external_memory"); if (info.support_VK_KHR_get_memory_requirements2()) enabledExtensions.push_back("VK_KHR_get_memory_requirements2"); if (info.support_VK_KHR_maintenance1()) enabledExtensions.push_back("VK_KHR_maintenance1"); if (info.support_VK_KHR_maintenance2()) enabledExtensions.push_back("VK_KHR_maintenance2"); if (info.support_VK_KHR_maintenance3()) enabledExtensions.push_back("VK_KHR_maintenance3"); if (info.support_VK_KHR_multiview()) enabledExtensions.push_back("VK_KHR_multiview"); if (info.support_VK_KHR_portability_subset()) enabledExtensions.push_back("VK_KHR_portability_subset"); if (info.support_VK_KHR_push_descriptor()) enabledExtensions.push_back("VK_KHR_push_descriptor"); if (info.support_VK_KHR_sampler_ycbcr_conversion()) enabledExtensions.push_back("VK_KHR_sampler_ycbcr_conversion"); if (info.support_VK_KHR_shader_bfloat16()) enabledExtensions.push_back("VK_KHR_shader_bfloat16"); if (info.support_VK_KHR_shader_float16_int8()) enabledExtensions.push_back("VK_KHR_shader_float16_int8"); if (info.support_VK_KHR_shader_float_controls()) enabledExtensions.push_back("VK_KHR_shader_float_controls"); if (info.support_VK_KHR_shader_float_controls2()) enabledExtensions.push_back("VK_KHR_shader_float_controls2"); if (info.support_VK_KHR_shader_integer_dot_product()) enabledExtensions.push_back("VK_KHR_shader_integer_dot_product"); if (info.support_VK_KHR_shader_non_semantic_info()) enabledExtensions.push_back("VK_KHR_shader_non_semantic_info"); if (info.support_VK_KHR_shader_subgroup_extended_types()) enabledExtensions.push_back("VK_KHR_shader_subgroup_extended_types"); if (info.support_VK_KHR_shader_subgroup_rotate()) enabledExtensions.push_back("VK_KHR_shader_subgroup_rotate"); if (info.support_VK_KHR_storage_buffer_storage_class()) enabledExtensions.push_back("VK_KHR_storage_buffer_storage_class"); if (info.support_VK_KHR_swapchain()) enabledExtensions.push_back("VK_KHR_swapchain"); if (info.support_VK_KHR_vulkan_memory_model()) enabledExtensions.push_back("VK_KHR_vulkan_memory_model"); if (info.support_VK_KHR_zero_initialize_workgroup_memory()) enabledExtensions.push_back("VK_KHR_zero_initialize_workgroup_memory"); if (info.support_VK_EXT_buffer_device_address()) enabledExtensions.push_back("VK_EXT_buffer_device_address"); if (info.support_VK_EXT_descriptor_indexing()) enabledExtensions.push_back("VK_EXT_descriptor_indexing"); if (info.support_VK_EXT_memory_budget()) enabledExtensions.push_back("VK_EXT_memory_budget"); if (info.support_VK_EXT_memory_priority()) enabledExtensions.push_back("VK_EXT_memory_priority"); if (info.support_VK_EXT_queue_family_foreign()) enabledExtensions.push_back("VK_EXT_queue_family_foreign"); if (info.support_VK_EXT_shader_atomic_float()) enabledExtensions.push_back("VK_EXT_shader_atomic_float"); if (info.support_VK_EXT_shader_atomic_float2()) enabledExtensions.push_back("VK_EXT_shader_atomic_float2"); if (info.support_VK_EXT_shader_float8()) enabledExtensions.push_back("VK_EXT_shader_float8"); if (info.support_VK_EXT_subgroup_size_control()) enabledExtensions.push_back("VK_EXT_subgroup_size_control"); if (info.support_VK_AMD_device_coherent_memory()) enabledExtensions.push_back("VK_AMD_device_coherent_memory"); #if __ANDROID_API__ >= 26 if (info.support_VK_ANDROID_external_memory_android_hardware_buffer()) enabledExtensions.push_back("VK_ANDROID_external_memory_android_hardware_buffer"); #endif // __ANDROID_API__ >= 26 if (info.support_VK_NV_cooperative_matrix()) enabledExtensions.push_back("VK_NV_cooperative_matrix"); if (info.support_VK_NV_cooperative_matrix2()) enabledExtensions.push_back("VK_NV_cooperative_matrix2"); if (info.support_VK_NV_cooperative_vector()) enabledExtensions.push_back("VK_NV_cooperative_vector"); const void* enabledExtensionFeatures = info.queryExtensionFeatures(); std::vector compute_queue_priorities(info.compute_queue_count(), 1.f); // 0.f ~ 1.f std::vector transfer_queue_priorities(info.transfer_queue_count(), 1.f); // 0.f ~ 1.f VkDeviceQueueCreateInfo deviceQueueCreateInfos[3]; VkDeviceQueueCreateInfo deviceComputeQueueCreateInfo; deviceComputeQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; deviceComputeQueueCreateInfo.pNext = 0; deviceComputeQueueCreateInfo.flags = 0; deviceComputeQueueCreateInfo.queueFamilyIndex = info.compute_queue_family_index(); deviceComputeQueueCreateInfo.queueCount = info.compute_queue_count(); deviceComputeQueueCreateInfo.pQueuePriorities = compute_queue_priorities.data(); VkDeviceQueueCreateInfo deviceTransferQueueCreateInfo; deviceTransferQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; deviceTransferQueueCreateInfo.pNext = 0; deviceTransferQueueCreateInfo.flags = 0; deviceTransferQueueCreateInfo.queueFamilyIndex = info.transfer_queue_family_index(); deviceTransferQueueCreateInfo.queueCount = info.transfer_queue_count(); deviceTransferQueueCreateInfo.pQueuePriorities = transfer_queue_priorities.data(); VkDeviceCreateInfo deviceCreateInfo; deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; deviceCreateInfo.pNext = enabledExtensionFeatures; deviceCreateInfo.flags = 0; if (info.compute_queue_family_index() == info.transfer_queue_family_index()) { deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; deviceCreateInfo.queueCreateInfoCount = 1; } else // if (info.compute_queue_family_index() != info.transfer_queue_family_index()) { deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; deviceQueueCreateInfos[1] = deviceTransferQueueCreateInfo; deviceCreateInfo.queueCreateInfoCount = 2; } deviceCreateInfo.pQueueCreateInfos = deviceQueueCreateInfos; deviceCreateInfo.enabledLayerCount = 0; deviceCreateInfo.ppEnabledLayerNames = 0; deviceCreateInfo.enabledExtensionCount = enabledExtensions.size(); deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data(); deviceCreateInfo.pEnabledFeatures = 0; // VkPhysicalDeviceFeatures pointer VkResult ret = vkCreateDevice(info.physicalDevice(), &deviceCreateInfo, 0, &d->device); if (ret != VK_SUCCESS) { NCNN_LOGE("vkCreateDevice failed %d", ret); return; } init_device_extension(); d->free_compute_queue_count = 0; d->free_transfer_queue_count = 0; d->free_compute_queue_count = info.compute_queue_count(); d->compute_queues.resize(info.compute_queue_count()); d->blob_allocators.resize(info.compute_queue_count()); d->staging_allocators.resize(info.compute_queue_count()); for (uint32_t i = 0; i < info.compute_queue_count(); i++) { vkGetDeviceQueue(d->device, info.compute_queue_family_index(), i, &d->compute_queues[i]); d->blob_allocators[i] = new VkBlobAllocator(this); d->staging_allocators[i] = new VkStagingAllocator(this); } if (info.compute_queue_family_index() != info.transfer_queue_family_index()) { d->free_transfer_queue_count = info.transfer_queue_count(); d->transfer_queues.resize(info.transfer_queue_count()); for (uint32_t i = 0; i < info.transfer_queue_count(); i++) { vkGetDeviceQueue(d->device, info.transfer_queue_family_index(), i, &d->transfer_queues[i]); } } // prepare immutable texelfetch sampler { VkSamplerCreateInfo samplerCreateInfo; samplerCreateInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; samplerCreateInfo.pNext = 0; samplerCreateInfo.flags = 0; samplerCreateInfo.magFilter = VK_FILTER_NEAREST; samplerCreateInfo.minFilter = VK_FILTER_NEAREST; samplerCreateInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; samplerCreateInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; samplerCreateInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; samplerCreateInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; samplerCreateInfo.mipLodBias = 0.0f; samplerCreateInfo.anisotropyEnable = VK_FALSE; samplerCreateInfo.maxAnisotropy = 1; samplerCreateInfo.compareEnable = VK_FALSE; samplerCreateInfo.compareOp = VK_COMPARE_OP_NEVER; samplerCreateInfo.minLod = 0.0f; samplerCreateInfo.maxLod = 0.0f; samplerCreateInfo.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; samplerCreateInfo.unnormalizedCoordinates = VK_TRUE; ret = vkCreateSampler(d->device, &samplerCreateInfo, 0, &d->texelfetch_sampler); if (ret != VK_SUCCESS) { NCNN_LOGE("vkCreateSampler failed %d", ret); } } int cret = d->create_dummy_buffer_image(); if (cret != 0) { NCNN_LOGE("VulkanDevice create_dummy_buffer_image failed %d", cret); return; } d->pipeline_cache = new PipelineCache(this); d->valid = true; } VulkanDevice::~VulkanDevice() { d->destroy_utility_operator(); d->destroy_dummy_buffer_image(); if (d->texelfetch_sampler) { vkDestroySampler(d->device, d->texelfetch_sampler, 0); } for (size_t i = 0; i < d->blob_allocators.size(); i++) { delete d->blob_allocators[i]; } d->blob_allocators.clear(); for (size_t i = 0; i < d->staging_allocators.size(); i++) { delete d->staging_allocators[i]; } d->staging_allocators.clear(); if (d->pipeline_cache) { delete d->pipeline_cache; } if (d->device) { vkDestroyDevice(d->device, 0); } delete d; } VulkanDevice::VulkanDevice(const VulkanDevice&) : info(get_gpu_info(0)), d(0) { } VulkanDevice& VulkanDevice::operator=(const VulkanDevice&) { return *this; } VkDevice VulkanDevice::vkdevice() const { return d->device; } bool VulkanDevice::is_valid() const { return d->valid; } VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const { VkShaderModuleCreateInfo shaderModuleCreateInfo; shaderModuleCreateInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; shaderModuleCreateInfo.pNext = 0; shaderModuleCreateInfo.flags = 0; shaderModuleCreateInfo.codeSize = spv_data_size; shaderModuleCreateInfo.pCode = spv_data; VkShaderModule shader_module; VkResult ret = vkCreateShaderModule(d->device, &shaderModuleCreateInfo, 0, &shader_module); if (ret != VK_SUCCESS) { NCNN_LOGE("vkCreateShaderModule failed %d", ret); return 0; } return shader_module; } static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t* dstcode, size_t* dstsize) { uint32_t local_size_x_id = -1; uint32_t local_size_y_id = -1; uint32_t local_size_z_id = -1; uint32_t gl_WorkGroupSize_id = -1; const uint32_t* p = code; uint32_t* dp = dstcode; // skip magic version generator bound schema memcpy(dp, p, 5 * sizeof(uint32_t)); p += 5; dp += 5; // foreach op while ((const unsigned char*)p < (const unsigned char*)code + size) { uint32_t opcode = p[0]; uint16_t wordcount = opcode >> 16; uint16_t op = opcode & 0xffff; if (op == 16) // OpExecutionMode { uint32_t mode = p[2]; if (mode == 17) // LocalSize { memcpy(dp, p, wordcount * sizeof(uint32_t)); // set local_size_xyz dp[3] = local_size_x; dp[4] = local_size_y; dp[5] = local_size_z; p += wordcount; dp += wordcount; continue; } } else if (op == 50) // OpSpecConstant { uint32_t id = p[2]; if (id == local_size_x_id || id == local_size_y_id || id == local_size_z_id) { p += wordcount; continue; } } else if (op == 51) // OpSpecConstantComposite { uint32_t id = p[2]; if (id == gl_WorkGroupSize_id) { if (wordcount == 6 && (p[3] == local_size_x_id || p[4] == local_size_y_id || p[5] == local_size_z_id)) { p += wordcount; continue; } } } else if (op == 71) // OpDecorate { uint32_t id = p[1]; uint32_t decoration = p[2]; if (decoration == 1) // SpecId { uint32_t specid = p[3]; if (specid == 233) local_size_x_id = id; if (specid == 234) local_size_y_id = id; if (specid == 235) local_size_z_id = id; if (specid == 233 || specid == 234 || specid == 235) { p += wordcount; continue; } } else if (decoration == 11) // BuiltIn { uint32_t builtin = p[3]; if (builtin == 25) // WorkgroupSize { gl_WorkGroupSize_id = id; p += wordcount; continue; } } } memcpy(dp, p, wordcount * sizeof(uint32_t)); p += wordcount; dp += wordcount; } *dstsize = (unsigned char*)dp - (unsigned char*)dstcode; } VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const { uint32_t* spv_data_modified = (uint32_t*)malloc(spv_data_size); size_t spv_data_size_modified = spv_data_size; inject_local_size_xyz(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, spv_data_modified, &spv_data_size_modified); VkShaderModule shader_module = compile_shader_module(spv_data_modified, spv_data_size_modified); free(spv_data_modified); return shader_module; } int VulkanDevice::create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const { if (binding_count == 0) { *descriptorset_layout = 0; return 0; } std::vector descriptorSetLayoutBindings(binding_count); for (int i = 0; i < binding_count; i++) { int binding_type = binding_types[i]; descriptorSetLayoutBindings[i].binding = i; descriptorSetLayoutBindings[i].descriptorCount = 1; descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; if (binding_type == 1) { descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; descriptorSetLayoutBindings[i].pImmutableSamplers = 0; } else if (binding_type == 2) { descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; descriptorSetLayoutBindings[i].pImmutableSamplers = 0; } else // if (binding_type == 3) { descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; descriptorSetLayoutBindings[i].pImmutableSamplers = immutable_texelfetch_sampler(); // we always use texelfetch } } VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo; descriptorSetLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; descriptorSetLayoutCreateInfo.pNext = 0; descriptorSetLayoutCreateInfo.flags = 0; descriptorSetLayoutCreateInfo.bindingCount = binding_count; descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings.data(); if (info.support_VK_KHR_push_descriptor()) { descriptorSetLayoutCreateInfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR; } VkResult ret = vkCreateDescriptorSetLayout(d->device, &descriptorSetLayoutCreateInfo, 0, descriptorset_layout); if (ret != VK_SUCCESS) { NCNN_LOGE("vkCreateDescriptorSetLayout failed %d", ret); return -1; } return 0; } int VulkanDevice::create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const { VkPushConstantRange pushConstantRange; pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; pushConstantRange.offset = 0; pushConstantRange.size = sizeof(vk_constant_type) * push_constant_count; VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo; pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; pipelineLayoutCreateInfo.pNext = 0; pipelineLayoutCreateInfo.flags = 0; if (descriptorset_layout) { pipelineLayoutCreateInfo.setLayoutCount = 1; pipelineLayoutCreateInfo.pSetLayouts = &descriptorset_layout; } else { pipelineLayoutCreateInfo.setLayoutCount = 0; pipelineLayoutCreateInfo.pSetLayouts = 0; } if (push_constant_count > 0) { pipelineLayoutCreateInfo.pushConstantRangeCount = 1; pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange; } else { pipelineLayoutCreateInfo.pushConstantRangeCount = 0; pipelineLayoutCreateInfo.pPushConstantRanges = 0; } VkResult ret = vkCreatePipelineLayout(d->device, &pipelineLayoutCreateInfo, 0, pipeline_layout); if (ret != VK_SUCCESS) { NCNN_LOGE("vkCreatePipelineLayout failed %d", ret); return -1; } return 0; } int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const { const int specialization_count = specializations.size(); std::vector specializationMapEntries(specialization_count); for (int i = 0; i < specialization_count; i++) { specializationMapEntries[i].constantID = i; specializationMapEntries[i].offset = i * sizeof(vk_specialization_type); specializationMapEntries[i].size = sizeof(vk_specialization_type); } VkSpecializationInfo specializationInfo; specializationInfo.mapEntryCount = specializationMapEntries.size(); specializationInfo.pMapEntries = specializationMapEntries.data(); specializationInfo.dataSize = specializations.size() * sizeof(vk_specialization_type); specializationInfo.pData = specializations.data(); VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo; pipelineShaderStageCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; pipelineShaderStageCreateInfo.pNext = 0; pipelineShaderStageCreateInfo.flags = 0; pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT; pipelineShaderStageCreateInfo.module = shader_module; pipelineShaderStageCreateInfo.pName = "main"; pipelineShaderStageCreateInfo.pSpecializationInfo = &specializationInfo; // but full subgroup bits enforce local_size_x be multiple of subgroup size // if (info.support_compute_full_subgroups()) // { // pipelineShaderStageCreateInfo.flags |= VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT; // } void* enabledExtensionFeatures = 0; // subgroup size control VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT pipelineShaderStageRequiredSubgroupSizeCreateInfo; pipelineShaderStageRequiredSubgroupSizeCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT; pipelineShaderStageRequiredSubgroupSizeCreateInfo.pNext = 0; pipelineShaderStageRequiredSubgroupSizeCreateInfo.requiredSubgroupSize = subgroup_size; if (info.support_subgroup_size_control()) { // pipelineShaderStageCreateInfo.flags |= VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT; pipelineShaderStageRequiredSubgroupSizeCreateInfo.pNext = enabledExtensionFeatures; enabledExtensionFeatures = &pipelineShaderStageRequiredSubgroupSizeCreateInfo; } pipelineShaderStageCreateInfo.pNext = enabledExtensionFeatures; VkComputePipelineCreateInfo computePipelineCreateInfo; computePipelineCreateInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; computePipelineCreateInfo.pNext = 0; computePipelineCreateInfo.flags = 0; computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo; computePipelineCreateInfo.layout = pipeline_layout; computePipelineCreateInfo.basePipelineHandle = 0; computePipelineCreateInfo.basePipelineIndex = 0; VkResult ret = vkCreateComputePipelines(d->device, 0, 1, &computePipelineCreateInfo, 0, pipeline); if (ret != VK_SUCCESS) { NCNN_LOGE("vkCreateComputePipelines failed %d", ret); return -1; } return 0; } int VulkanDevice::create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const { if (binding_count == 0) { *descriptor_update_template = 0; return 0; } std::vector descriptorUpdateTemplateEntries(binding_count); size_t offset = 0; for (int i = 0; i < binding_count; i++) // TODO do not update weights { int binding_type = binding_types[i]; descriptorUpdateTemplateEntries[i].dstBinding = i; descriptorUpdateTemplateEntries[i].dstArrayElement = 0; descriptorUpdateTemplateEntries[i].descriptorCount = 1; descriptorUpdateTemplateEntries[i].offset = offset; if (binding_type == 1) { descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorBufferInfo); } else if (binding_type == 2) { descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorImageInfo); } else // if (binding_type == 3) { descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorImageInfo); } offset += descriptorUpdateTemplateEntries[i].stride; } VkDescriptorUpdateTemplateCreateInfoKHR descriptorUpdateTemplateCreateInfo; descriptorUpdateTemplateCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR; descriptorUpdateTemplateCreateInfo.pNext = 0; descriptorUpdateTemplateCreateInfo.flags = 0; descriptorUpdateTemplateCreateInfo.descriptorUpdateEntryCount = binding_count; // TODO do not update weights descriptorUpdateTemplateCreateInfo.pDescriptorUpdateEntries = descriptorUpdateTemplateEntries.data(); if (info.support_VK_KHR_push_descriptor()) { descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR; } else { descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR; } // descriptorSetLayout should be ignored if VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR // FIXME HACK WARNING TODO NOTE but crash on radv if set NULL :( descriptorUpdateTemplateCreateInfo.descriptorSetLayout = descriptorset_layout; descriptorUpdateTemplateCreateInfo.pipelineBindPoint = VK_PIPELINE_BIND_POINT_COMPUTE; descriptorUpdateTemplateCreateInfo.pipelineLayout = pipeline_layout; descriptorUpdateTemplateCreateInfo.set = 0; VkResult ret = vkCreateDescriptorUpdateTemplateKHR(d->device, &descriptorUpdateTemplateCreateInfo, 0, descriptor_update_template); if (ret != VK_SUCCESS) { NCNN_LOGE("vkCreateDescriptorUpdateTemplateKHR failed %d", ret); return -1; } return 0; } uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const { const VkPhysicalDeviceMemoryProperties& memory_properties = info.physicalDeviceMemoryProperties(); // first try, find required and with preferred and without preferred_not for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++) { bool is_required = (1 << i) & memory_type_bits; if (is_required) { const VkMemoryType& memoryType = memory_properties.memoryTypes[i]; if ((memoryType.propertyFlags & required) == required && (preferred && (memoryType.propertyFlags & preferred)) && (preferred_not && !(memoryType.propertyFlags & preferred_not))) { return i; } } } // second try, find required and with preferred for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++) { bool is_required = (1 << i) & memory_type_bits; if (is_required) { const VkMemoryType& memoryType = memory_properties.memoryTypes[i]; if ((memoryType.propertyFlags & required) == required && (preferred && (memoryType.propertyFlags & preferred))) { return i; } } } // third try, find required and without preferred_not for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++) { bool is_required = (1 << i) & memory_type_bits; if (is_required) { const VkMemoryType& memoryType = memory_properties.memoryTypes[i]; if ((memoryType.propertyFlags & required) == required && (preferred_not && !(memoryType.propertyFlags & preferred_not))) { return i; } } } // fourth try, find any required for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++) { bool is_required = (1 << i) & memory_type_bits; if (is_required) { const VkMemoryType& memoryType = memory_properties.memoryTypes[i]; if ((memoryType.propertyFlags & required) == required) { return i; } } } NCNN_LOGE("no such memory type %u %u %u %u", memory_type_bits, required, preferred, preferred_not); return -1; } bool VulkanDevice::is_mappable(uint32_t memory_type_index) const { const VkMemoryType& memoryType = info.physicalDeviceMemoryProperties().memoryTypes[memory_type_index]; return memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; } bool VulkanDevice::is_coherent(uint32_t memory_type_index) const { const VkMemoryType& memoryType = info.physicalDeviceMemoryProperties().memoryTypes[memory_type_index]; return memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; } VkQueue VulkanDevice::acquire_queue(uint32_t queue_family_index) const { if (queue_family_index != info.compute_queue_family_index() && queue_family_index != info.transfer_queue_family_index()) { NCNN_LOGE("invalid queue_family_index %u", queue_family_index); return 0; } Mutex& queue_lock = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_lock : d->transfer_queue_lock; queue_lock.lock(); ConditionVariable& queue_condition = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_condition : d->transfer_queue_condition; int& free_queue_count = queue_family_index == info.compute_queue_family_index() ? d->free_compute_queue_count : d->free_transfer_queue_count; while (free_queue_count == 0) { // no free queues, wait for recleams from other threads queue_condition.wait(queue_lock); } std::vector& queues = queue_family_index == info.compute_queue_family_index() ? d->compute_queues : d->transfer_queues; VkQueue queue = 0; for (size_t i = 0; i < queues.size(); i++) { if (queues[i]) { queue = queues[i]; queues[i] = 0; break; } } if (!queue) { NCNN_LOGE("FATAL ERROR! out of hardware queue %u", queue_family_index); } free_queue_count -= 1; queue_lock.unlock(); queue_condition.signal(); return queue; } void VulkanDevice::reclaim_queue(uint32_t queue_family_index, VkQueue queue) const { if (queue_family_index != info.compute_queue_family_index() && queue_family_index != info.transfer_queue_family_index()) { NCNN_LOGE("invalid queue_family_index %u", queue_family_index); return; } Mutex& queue_lock = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_lock : d->transfer_queue_lock; queue_lock.lock(); ConditionVariable& queue_condition = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_condition : d->transfer_queue_condition; int& free_queue_count = queue_family_index == info.compute_queue_family_index() ? d->free_compute_queue_count : d->free_transfer_queue_count; std::vector& queues = queue_family_index == info.compute_queue_family_index() ? d->compute_queues : d->transfer_queues; size_t i = 0; for (; i < queues.size(); i++) { if (!queues[i]) { queues[i] = queue; break; } } if (i == queues.size()) { NCNN_LOGE("FATAL ERROR! reclaim_queue get wild queue %u %p", queue_family_index, queue); } free_queue_count += 1; queue_lock.unlock(); queue_condition.signal(); } VkAllocator* VulkanDevice::acquire_blob_allocator() const { MutexLockGuard lock(d->blob_allocator_lock); for (int i = 0; i < (int)d->blob_allocators.size(); i++) { VkAllocator* allocator = d->blob_allocators[i]; if (allocator) { d->blob_allocators[i] = 0; return allocator; } } // pre-allocated allcator exhausted, create new VkAllocator* allocator = new VkBlobAllocator(this); d->blob_allocators.push_back(allocator); d->blob_allocators[d->blob_allocators.size() - 1] = 0; return allocator; } void VulkanDevice::reclaim_blob_allocator(VkAllocator* allocator) const { MutexLockGuard lock(d->blob_allocator_lock); for (int i = 0; i < (int)d->blob_allocators.size(); i++) { if (!d->blob_allocators[i]) { d->blob_allocators[i] = allocator; return; } } NCNN_LOGE("FATAL ERROR! reclaim_blob_allocator get wild allocator %p", allocator); } VkAllocator* VulkanDevice::acquire_staging_allocator() const { MutexLockGuard lock(d->staging_allocator_lock); for (int i = 0; i < (int)d->staging_allocators.size(); i++) { VkAllocator* allocator = d->staging_allocators[i]; if (allocator) { d->staging_allocators[i] = 0; return allocator; } } // pre-allocated allcator exhausted, create new VkAllocator* allocator = new VkStagingAllocator(this); d->staging_allocators.push_back(allocator); d->staging_allocators[d->staging_allocators.size() - 1] = 0; return allocator; } void VulkanDevice::reclaim_staging_allocator(VkAllocator* allocator) const { MutexLockGuard lock(d->staging_allocator_lock); for (int i = 0; i < (int)d->staging_allocators.size(); i++) { if (!d->staging_allocators[i]) { d->staging_allocators[i] = allocator; return; } } NCNN_LOGE("FATAL ERROR! reclaim_staging_allocator get wild allocator %p", allocator); } const VkSampler* VulkanDevice::immutable_texelfetch_sampler() const { return &d->texelfetch_sampler; } VkMat VulkanDevice::get_dummy_buffer() const { return d->dummy_buffer; } VkImageMat VulkanDevice::get_dummy_image() const { return d->dummy_image; } VkImageMat VulkanDevice::get_dummy_image_readonly() const { #if __APPLE__ if (info.type() != 0) return d->dummy_image; #endif return d->dummy_image_readonly; } const PipelineCache* VulkanDevice::get_pipeline_cache() const { return d->pipeline_cache; } bool VulkanDevice::shape_support_image_storage(const Mat& shape) const { int dims = shape.dims; int width = shape.w; int height = shape.h; int depth = shape.c; int elempack = shape.elempack; // large elempack spills on image w if (elempack == 8) width *= 2; if (elempack == 16) width *= 4; if (elempack == 32) width *= 8; if (elempack == 64) width *= 16; if (dims == 1) { if (width > (int)info.max_image_dimension_1d()) { return false; } } else if (dims == 2) { if (width > (int)info.max_image_dimension_2d() || height > (int)info.max_image_dimension_2d()) { return false; } } else // if (dims == 3) { if (width > (int)info.max_image_dimension_3d() || height > (int)info.max_image_dimension_3d() || depth > (int)info.max_image_dimension_3d()) { return false; } } return true; } uint32_t VulkanDevice::get_heap_budget() const { const VkPhysicalDeviceMemoryProperties& memory_properties = info.physicalDeviceMemoryProperties(); uint32_t buffer_memory_type_index = d->dummy_allocator->buffer_memory_type_index; uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex; if (!info.support_VK_EXT_memory_budget()) { // NCNN_LOGE("heap budget from assumption\n"); uint32_t device_local_heap_size = memory_properties.memoryHeaps[buffer_heap_index].size / 1024 / 1024; // we usually cannot use all heap // 70% for 4G+ // 50% for 4G- return device_local_heap_size >= 4000 ? device_local_heap_size * 0.7 : device_local_heap_size * 0.5; } VkPhysicalDeviceMemoryBudgetPropertiesEXT memoryBudgetProperties; memoryBudgetProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT; memoryBudgetProperties.pNext = 0; VkPhysicalDeviceMemoryProperties2KHR memoryProperties; memoryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2_KHR; memoryProperties.pNext = &memoryBudgetProperties; vkGetPhysicalDeviceMemoryProperties2KHR(info.physicalDevice(), &memoryProperties); return memoryBudgetProperties.heapBudget[buffer_heap_index] / 1024 / 1024; } void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const { convert_packing(src, dst, dst_elempack, 0, cmd, opt); } void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const { int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2; int cast_type_from_index; if (src.elembits() == 32) { cast_type_from_index = 0; } else if (src.elembits() == 16) { cast_type_from_index = 1; } else // if (src.elembits() == 8) { cast_type_from_index = 3; } int cast_type_to_index = cast_type_to ? cast_type_to - 1 : cast_type_from_index; // NCNN_LOGE("convert_packing b2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index); if ((cast_type_from_index == 0 || cast_type_from_index == 1) && (cast_type_to_index == 2 || cast_type_to_index == 3)) { NCNN_LOGE("convert_packing from fp32/fp16 to int32/int8 is not supported"); return; } if ((cast_type_from_index == 2 || cast_type_from_index == 3) && (cast_type_to_index == 0 || cast_type_to_index == 1)) { NCNN_LOGE("convert_packing from int32/int8 to fp32/fp16 is not supported"); return; } Option opt2 = opt; opt2.use_fp16_packed = (cast_type_from_index == 1 || cast_type_to_index == 1); opt2.use_fp16_storage = (cast_type_from_index == 1 || cast_type_to_index == 1) && info.support_fp16_storage(); opt2.use_int8_packed = (cast_type_from_index == 3 || cast_type_to_index == 3); opt2.use_int8_storage = (cast_type_from_index == 3 || cast_type_to_index == 3) && info.support_int8_storage(); const ncnn::Layer* uop = d->get_utility_operator(cast_type_from_index, cast_type_to_index, packing_type_to_index); uop->forward(src, dst, cmd, opt2); } int VulkanDevice::init_device_extension() { if (info.support_VK_KHR_bind_memory2()) { vkBindBufferMemory2KHR = (PFN_vkBindBufferMemory2KHR)vkGetDeviceProcAddr(d->device, "vkBindBufferMemory2KHR"); vkBindImageMemory2KHR = (PFN_vkBindImageMemory2KHR)vkGetDeviceProcAddr(d->device, "vkBindImageMemory2KHR"); } if (info.support_VK_KHR_buffer_device_address()) { vkGetBufferDeviceAddressKHR = (PFN_vkGetBufferDeviceAddressKHR)vkGetDeviceProcAddr(d->device, "vkGetBufferDeviceAddressKHR"); vkGetBufferOpaqueCaptureAddressKHR = (PFN_vkGetBufferOpaqueCaptureAddressKHR)vkGetDeviceProcAddr(d->device, "vkGetBufferOpaqueCaptureAddressKHR"); vkGetDeviceMemoryOpaqueCaptureAddressKHR = (PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR)vkGetDeviceProcAddr(d->device, "vkGetDeviceMemoryOpaqueCaptureAddressKHR"); } if (info.support_VK_KHR_descriptor_update_template()) { vkCreateDescriptorUpdateTemplateKHR = (PFN_vkCreateDescriptorUpdateTemplateKHR)vkGetDeviceProcAddr(d->device, "vkCreateDescriptorUpdateTemplateKHR"); vkDestroyDescriptorUpdateTemplateKHR = (PFN_vkDestroyDescriptorUpdateTemplateKHR)vkGetDeviceProcAddr(d->device, "vkDestroyDescriptorUpdateTemplateKHR"); vkUpdateDescriptorSetWithTemplateKHR = (PFN_vkUpdateDescriptorSetWithTemplateKHR)vkGetDeviceProcAddr(d->device, "vkUpdateDescriptorSetWithTemplateKHR"); } if (info.support_VK_KHR_get_memory_requirements2()) { vkGetImageMemoryRequirements2KHR = (PFN_vkGetImageMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetImageMemoryRequirements2KHR"); vkGetBufferMemoryRequirements2KHR = (PFN_vkGetBufferMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetBufferMemoryRequirements2KHR"); } if (info.support_VK_KHR_maintenance1()) { vkTrimCommandPoolKHR = (PFN_vkTrimCommandPoolKHR)vkGetDeviceProcAddr(d->device, "vkTrimCommandPoolKHR"); } if (info.support_VK_KHR_maintenance3()) { vkGetDescriptorSetLayoutSupportKHR = (PFN_vkGetDescriptorSetLayoutSupportKHR)vkGetDeviceProcAddr(d->device, "vkGetDescriptorSetLayoutSupportKHR"); } if (info.support_VK_KHR_push_descriptor()) { if (info.support_VK_KHR_descriptor_update_template()) { vkCmdPushDescriptorSetWithTemplateKHR = (PFN_vkCmdPushDescriptorSetWithTemplateKHR)vkGetDeviceProcAddr(d->device, "vkCmdPushDescriptorSetWithTemplateKHR"); } vkCmdPushDescriptorSetKHR = (PFN_vkCmdPushDescriptorSetKHR)vkGetDeviceProcAddr(d->device, "vkCmdPushDescriptorSetKHR"); } if (info.support_VK_KHR_sampler_ycbcr_conversion()) { vkCreateSamplerYcbcrConversionKHR = (PFN_vkCreateSamplerYcbcrConversionKHR)vkGetDeviceProcAddr(d->device, "vkCreateSamplerYcbcrConversionKHR"); vkDestroySamplerYcbcrConversionKHR = (PFN_vkDestroySamplerYcbcrConversionKHR)vkGetDeviceProcAddr(d->device, "vkDestroySamplerYcbcrConversionKHR"); } if (info.support_VK_KHR_swapchain()) { vkCreateSwapchainKHR = (PFN_vkCreateSwapchainKHR)vkGetDeviceProcAddr(d->device, "vkCreateSwapchainKHR"); vkDestroySwapchainKHR = (PFN_vkDestroySwapchainKHR)vkGetDeviceProcAddr(d->device, "vkDestroySwapchainKHR"); vkGetSwapchainImagesKHR = (PFN_vkGetSwapchainImagesKHR)vkGetDeviceProcAddr(d->device, "vkGetSwapchainImagesKHR"); vkAcquireNextImageKHR = (PFN_vkAcquireNextImageKHR)vkGetDeviceProcAddr(d->device, "vkAcquireNextImageKHR"); vkQueuePresentKHR = (PFN_vkQueuePresentKHR)vkGetDeviceProcAddr(d->device, "vkQueuePresentKHR"); } if (info.support_VK_EXT_buffer_device_address()) { vkGetBufferDeviceAddressEXT = (PFN_vkGetBufferDeviceAddressEXT)vkGetDeviceProcAddr(d->device, "vkGetBufferDeviceAddressEXT"); } #if __ANDROID_API__ >= 26 if (info.support_VK_ANDROID_external_memory_android_hardware_buffer()) { vkGetAndroidHardwareBufferPropertiesANDROID = (PFN_vkGetAndroidHardwareBufferPropertiesANDROID)vkGetDeviceProcAddr(d->device, "vkGetAndroidHardwareBufferPropertiesANDROID"); vkGetMemoryAndroidHardwareBufferANDROID = (PFN_vkGetMemoryAndroidHardwareBufferANDROID)vkGetDeviceProcAddr(d->device, "vkGetMemoryAndroidHardwareBufferANDROID"); } #endif // __ANDROID_API__ >= 26 if (info.support_VK_NV_cooperative_vector()) { vkCmdConvertCooperativeVectorMatrixNV = (PFN_vkCmdConvertCooperativeVectorMatrixNV)vkGetDeviceProcAddr(d->device, "vkCmdConvertCooperativeVectorMatrixNV"); vkConvertCooperativeVectorMatrixNV = (PFN_vkConvertCooperativeVectorMatrixNV)vkGetDeviceProcAddr(d->device, "vkConvertCooperativeVectorMatrixNV"); } return 0; } VulkanDevice* get_gpu_device(int device_index) { try_create_gpu_instance(); if (device_index < 0 || device_index >= g_gpu_count) return 0; MutexLockGuard lock(g_default_vkdev_lock); if (!g_default_vkdev[device_index]) g_default_vkdev[device_index] = new VulkanDevice(device_index); return g_default_vkdev[device_index]; } static TBuiltInResource get_default_TBuiltInResource() { TBuiltInResource resource; resource.maxLights = 32; resource.maxClipPlanes = 6; resource.maxTextureUnits = 32; resource.maxTextureCoords = 32; resource.maxVertexAttribs = 64; resource.maxVertexUniformComponents = 4096; resource.maxVaryingFloats = 64; resource.maxVertexTextureImageUnits = 32; resource.maxCombinedTextureImageUnits = 80; resource.maxTextureImageUnits = 32; resource.maxFragmentUniformComponents = 4096; resource.maxDrawBuffers = 32; resource.maxVertexUniformVectors = 128; resource.maxVaryingVectors = 8; resource.maxFragmentUniformVectors = 16; resource.maxVertexOutputVectors = 16; resource.maxFragmentInputVectors = 15; resource.minProgramTexelOffset = -8; resource.maxProgramTexelOffset = 7; resource.maxClipDistances = 8; resource.maxComputeWorkGroupCountX = 65535; resource.maxComputeWorkGroupCountY = 65535; resource.maxComputeWorkGroupCountZ = 65535; resource.maxComputeWorkGroupSizeX = 1024; resource.maxComputeWorkGroupSizeY = 1024; resource.maxComputeWorkGroupSizeZ = 64; resource.maxComputeUniformComponents = 1024; resource.maxComputeTextureImageUnits = 16; resource.maxComputeImageUniforms = 8; resource.maxComputeAtomicCounters = 8; resource.maxComputeAtomicCounterBuffers = 1; resource.maxVaryingComponents = 60; resource.maxVertexOutputComponents = 64; resource.maxGeometryInputComponents = 64; resource.maxGeometryOutputComponents = 128; resource.maxFragmentInputComponents = 128; resource.maxImageUnits = 8; resource.maxCombinedImageUnitsAndFragmentOutputs = 8; resource.maxCombinedShaderOutputResources = 8; resource.maxImageSamples = 0; resource.maxVertexImageUniforms = 0; resource.maxTessControlImageUniforms = 0; resource.maxTessEvaluationImageUniforms = 0; resource.maxGeometryImageUniforms = 0; resource.maxFragmentImageUniforms = 8; resource.maxCombinedImageUniforms = 8; resource.maxGeometryTextureImageUnits = 16; resource.maxGeometryOutputVertices = 256; resource.maxGeometryTotalOutputComponents = 1024; resource.maxGeometryUniformComponents = 1024; resource.maxGeometryVaryingComponents = 64; resource.maxTessControlInputComponents = 128; resource.maxTessControlOutputComponents = 128; resource.maxTessControlTextureImageUnits = 16; resource.maxTessControlUniformComponents = 1024; resource.maxTessControlTotalOutputComponents = 4096; resource.maxTessEvaluationInputComponents = 128; resource.maxTessEvaluationOutputComponents = 128; resource.maxTessEvaluationTextureImageUnits = 16; resource.maxTessEvaluationUniformComponents = 1024; resource.maxTessPatchComponents = 120; resource.maxPatchVertices = 32; resource.maxTessGenLevel = 64; resource.maxViewports = 16; resource.maxVertexAtomicCounters = 0; resource.maxTessControlAtomicCounters = 0; resource.maxTessEvaluationAtomicCounters = 0; resource.maxGeometryAtomicCounters = 0; resource.maxFragmentAtomicCounters = 8; resource.maxCombinedAtomicCounters = 8; resource.maxAtomicCounterBindings = 1; resource.maxVertexAtomicCounterBuffers = 0; resource.maxTessControlAtomicCounterBuffers = 0; resource.maxTessEvaluationAtomicCounterBuffers = 0; resource.maxGeometryAtomicCounterBuffers = 0; resource.maxFragmentAtomicCounterBuffers = 1; resource.maxCombinedAtomicCounterBuffers = 1; resource.maxAtomicCounterBufferSize = 16384; resource.maxTransformFeedbackBuffers = 4; resource.maxTransformFeedbackInterleavedComponents = 64; resource.maxCullDistances = 8; resource.maxCombinedClipAndCullDistances = 8; resource.maxSamples = 4; resource.maxMeshOutputVerticesNV = 256; resource.maxMeshOutputPrimitivesNV = 512; resource.maxMeshWorkGroupSizeX_NV = 32; resource.maxMeshWorkGroupSizeY_NV = 1; resource.maxMeshWorkGroupSizeZ_NV = 1; resource.maxTaskWorkGroupSizeX_NV = 32; resource.maxTaskWorkGroupSizeY_NV = 1; resource.maxTaskWorkGroupSizeZ_NV = 1; resource.maxMeshViewCountNV = 4; // TODO compile-time glslang version check // resource.maxDualSourceDrawBuffersEXT = 1; resource.limits.nonInductiveForLoops = 1; resource.limits.whileLoops = 1; resource.limits.doWhileLoops = 1; resource.limits.generalUniformIndexing = 1; resource.limits.generalAttributeMatrixVectorIndexing = 1; resource.limits.generalVaryingIndexing = 1; resource.limits.generalSamplerIndexing = 1; resource.limits.generalVariableIndexing = 1; resource.limits.generalConstantMatrixVectorIndexing = 1; return resource; } class VulkanShaderIncluder : public glslang::TShader::Includer { public: virtual glslang::TShader::Includer::IncludeResult* includeLocal(const char* headerName, const char* /*includerName*/, size_t /*inclusionDepth*/) { if (strcmp(headerName, "vulkan_activation.comp") == 0) { const char* const headerData = vulkan_activation_comp_data; const size_t headerLength = sizeof(vulkan_activation_comp_data); glslang::TShader::Includer::IncludeResult* r = new glslang::TShader::Includer::IncludeResult(headerName, headerData, headerLength, 0); return r; } return 0; } virtual void releaseInclude(glslang::TShader::Includer::IncludeResult* r) { delete r; } }; class DefinitionCollector { public: template void append(const char* key, T def) { definitions.push_back(std::make_pair(key, def)); } public: struct typed_value { typed_value(const char* _s) : type(0), s(_s) { } typed_value(uint8_t _u8) : type(1), u8(_u8) { } typed_value(uint32_t _u32) : type(2), u32(_u32) { } typed_value(int32_t _i32) : type(3), i32(_i32) { } typed_value(uint64_t _u64) : type(4), u64(_u64) { } typed_value(float _f32) : type(5), f32(_f32) { } int type; union { const char* s; uint8_t u8; uint32_t u32; int32_t i32; uint64_t u64; float f32; }; }; std::vector > definitions; }; int compile_spirv_module(const char* comp_string, const Option& opt, std::vector& spirv) { // -1 for omitting the tail '\0' int length = strlen(comp_string) - 1; return compile_spirv_module(comp_string, length, opt, spirv); } int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector& spirv) { DefinitionCollector custom_defines; DefinitionCollector device_defines; if (opt.use_fp16_storage) { custom_defines.append("sfp", "float16_t"); custom_defines.append("sfpvec2", "f16vec2"); custom_defines.append("sfpvec4", "f16vec4"); if (opt.use_fp16_arithmetic) { custom_defines.append("sfpvec8", "f16mat2x4"); custom_defines.append("sfpmat4", "f16mat4"); } } else if (opt.use_fp16_packed) { custom_defines.append("sfp", "uint"); custom_defines.append("sfpvec2", "uint"); custom_defines.append("sfpvec4", "uvec2"); custom_defines.append("sfpvec8", "uvec4"); } else { custom_defines.append("sfp", "float"); custom_defines.append("sfpvec2", "vec2"); custom_defines.append("sfpvec4", "vec4"); custom_defines.append("sfpvec8", "mat2x4"); custom_defines.append("sfpmat4", "mat4"); } if (opt.use_fp16_arithmetic) { custom_defines.append("afp", "float16_t"); custom_defines.append("afpvec2", "f16vec2"); custom_defines.append("afpvec4", "f16vec4"); custom_defines.append("afpvec8", "f16mat2x4"); custom_defines.append("afpmat4", "f16mat4"); } else { custom_defines.append("afp", "float"); custom_defines.append("afpvec2", "vec2"); custom_defines.append("afpvec4", "vec4"); custom_defines.append("afpvec8", "mat2x4"); custom_defines.append("afpmat4", "mat4"); } if (opt.use_fp16_storage && opt.use_fp16_uniform && opt.use_fp16_arithmetic) { custom_defines.append("lfp", "float16_t"); custom_defines.append("lfpvec4", "f16vec4"); } else if (opt.use_fp16_storage && opt.use_fp16_arithmetic) { custom_defines.append("lfp", "float"); custom_defines.append("lfpvec4", "uint64_t"); } else if (opt.use_fp16_storage || opt.use_fp16_packed) { custom_defines.append("lfp", "float"); custom_defines.append("lfpvec4", "uvec2"); } else { custom_defines.append("lfp", "float"); custom_defines.append("lfpvec4", "vec4"); } if (opt.use_fp16_storage && opt.use_fp16_uniform && opt.use_fp16_arithmetic) { custom_defines.append("sfp2lfp(v)", "v"); custom_defines.append("sfp2lfpvec4(v)", "v"); custom_defines.append("lfp2afp(v)", "v"); custom_defines.append("lfp2afpvec4(v)", "v"); } else if (opt.use_fp16_storage && opt.use_fp16_arithmetic) { custom_defines.append("sfp2lfp(v)", "float(v)"); custom_defines.append("sfp2lfpvec4(v)", "pack64(halfBitsToUInt16(v))"); custom_defines.append("lfp2afp(v)", "float16_t(v)"); custom_defines.append("lfp2afpvec4(v)", "int16BitsToHalf(unpack16(v))"); } else if (opt.use_fp16_packed && opt.use_fp16_arithmetic) { custom_defines.append("sfp2lfp(v)", "v"); custom_defines.append("sfp2lfpvec4(v)", "v"); custom_defines.append("lfp2afp(v)", "float16_t(v)"); custom_defines.append("lfp2afpvec4(v)", "f16vec4(unpackFloat2x16(v.x),unpackFloat2x16(v.y))"); } else if (opt.use_fp16_storage) { custom_defines.append("sfp2lfp(v)", "float(v)"); custom_defines.append("sfp2lfpvec4(v)", "uvec2(packHalf2x16(vec4(v).rg),packHalf2x16(vec4(v).ba))"); custom_defines.append("lfp2afp(v)", "v"); custom_defines.append("lfp2afpvec4(v)", "vec4(unpackHalf2x16(v.x),unpackHalf2x16(v.y))"); } else if (opt.use_fp16_packed) { custom_defines.append("sfp2lfp(v)", "v"); custom_defines.append("sfp2lfpvec4(v)", "v"); custom_defines.append("lfp2afp(v)", "v"); custom_defines.append("lfp2afpvec4(v)", "vec4(unpackHalf2x16(v.x),unpackHalf2x16(v.y))"); } else { custom_defines.append("sfp2lfp(v)", "v"); custom_defines.append("sfp2lfpvec4(v)", "v"); custom_defines.append("lfp2afp(v)", "v"); custom_defines.append("lfp2afpvec4(v)", "v"); } if (opt.use_fp16_storage && opt.use_fp16_arithmetic) { custom_defines.append("buffer_ld1(buf,i)", "buf[i]"); custom_defines.append("buffer_st1(buf,i,v)", "{buf[i]=v;}"); custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"); custom_defines.append("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"); custom_defines.append("buffer_ld2(buf,i)", "buf[i]"); custom_defines.append("buffer_st2(buf,i,v)", "{buf[i]=v;}"); custom_defines.append("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_ld4(buf,i)", "buf[i]"); custom_defines.append("buffer_st4(buf,i,v)", "{buf[i]=v;}"); custom_defines.append("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"); custom_defines.append("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}"); custom_defines.append("buffer_ld8(buf,i)", "buf[i]"); custom_defines.append("buffer_st8(buf,i,v)", "{buf[i]=v;}"); custom_defines.append("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"); custom_defines.append("buffer_cp8to4(buf,i2,sbuf,si)", "{f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"); custom_defines.append("sfp2afpmat4(v)", "v"); custom_defines.append("afp2sfpmat4(v)", "v"); } else if (opt.use_fp16_packed && opt.use_fp16_arithmetic) { // custom_defines.append("buffer_ld1(buf,i)", "float16_t(buf[i])"); custom_defines.append("buffer_ld1(buf,i)", "float16_t(unpackHalf2x16(buf[(i)/2])[(i)%2])"); // custom_defines.append("buffer_st1(buf,i,v)", "{buf[i]=float(v);}"); custom_defines.append("buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id2=_i/2;uint _im2=_i%2;float _vs=float(v);uint _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id2],0,0);vec2 _v=unpackHalf2x16(_old_v);_v[_im2]=_vs;_new_v=packHalf2x16(_v);} while(atomicCompSwap(buf[_id2],_old_v,_new_v)!=_old_v);}"); // custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{uint _i=uint(i);uint _id2=_i/2;uint _im2=_i%2;uint _si=uint(si);uint _sid2=_si/2;uint _sim2=_si%2;float v=unpackHalf2x16(sbuf[_sid2])[_sim2];uint _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id2],0,0);vec2 _v=unpackHalf2x16(_old_v);_v[_im2]=v;_new_v=packHalf2x16(_v);} while(atomicCompSwap(buf[_id2],_old_v,_new_v)!=_old_v);}"); // custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packFloat2x16(f16vec2(sbuf[si4.r],sbuf[si4.g])),packFloat2x16(f16vec2(sbuf[si4.b],sbuf[si4.a])));}"); custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{uvec4 _si4d2=uvec4(si4)/2;uvec4 _si4m2=uvec4(si4)%2; buf[i]=uvec2(packHalf2x16(vec2(unpackHalf2x16(sbuf[_si4d2.r])[_si4m2.r],unpackHalf2x16(sbuf[_si4d2.g])[_si4m2.g])),packHalf2x16(vec2(unpackHalf2x16(sbuf[_si4d2.b])[_si4m2.b],unpackHalf2x16(sbuf[_si4d2.a])[_si4m2.a])));}"); // custom_defines.append("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packFloat2x16(f16vec2(sbuf[si4.r],sbuf[si4.g])),packFloat2x16(f16vec2(sbuf[si4.b],sbuf[si4.a])),packFloat2x16(f16vec2(sbuf[sii4.r],sbuf[sii4.g])),packFloat2x16(f16vec2(sbuf[sii4.b],sbuf[sii4.a])));}"); custom_defines.append("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{uvec4 _si4d2=uvec4(si4)/2;uvec4 _sii4d2=uvec4(sii4)/2;uvec4 _si4m2=uvec4(si4)%2;uvec4 _sii4m2=uvec4(sii4)%2; buf[i]=uvec4(packHalf2x16(vec2(unpackHalf2x16(sbuf[_si4d2.r])[_si4m2.r],unpackHalf2x16(sbuf[_si4d2.g])[_si4m2.g])),packHalf2x16(vec2(unpackHalf2x16(sbuf[_si4d2.b])[_si4m2.b],unpackHalf2x16(sbuf[_si4d2.a])[_si4m2.a])),packHalf2x16(vec2(unpackHalf2x16(sbuf[_sii4d2.r])[_sii4m2.r],unpackHalf2x16(sbuf[_sii4d2.g])[_sii4m2.g])),packHalf2x16(vec2(unpackHalf2x16(sbuf[_sii4d2.b])[_sii4m2.b],unpackHalf2x16(sbuf[_sii4d2.a])[_sii4m2.a])));}"); custom_defines.append("buffer_ld2(buf,i)", "unpackFloat2x16(buf[i])"); custom_defines.append("buffer_st2(buf,i,v)", "{buf[i]=packFloat2x16(v)}"); custom_defines.append("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_ld4(buf,i)", "f16vec4(unpackFloat2x16(buf[i].x),unpackFloat2x16(buf[i].y))"); custom_defines.append("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packFloat2x16(v.rg),packFloat2x16(v.ba));}"); custom_defines.append("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); // custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; f16vec2 _v0=unpackFloat2x16(_v.x);f16vec2 _v1=unpackFloat2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"); custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y);buffer_st1(buf,i4.r,_v0.r);buffer_st1(buf,i4.g,_v0.g);buffer_st1(buf,i4.b,_v1.r);buffer_st1(buf,i4.a,_v1.g);}"); custom_defines.append("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"); custom_defines.append("buffer_ld8(buf,i)", "f16mat2x4(f16vec4(unpackFloat2x16(buf[i].r),unpackFloat2x16(buf[i].g)),f16vec4(unpackFloat2x16(buf[i].b),unpackFloat2x16(buf[i].a)))"); custom_defines.append("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packFloat2x16(v[0].rg),packFloat2x16(v[0].ba)),uvec2(packFloat2x16(v[1].rg),packFloat2x16(v[1].ba)));}"); custom_defines.append("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); // custom_defines.append("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; f16vec2 _v0=unpackFloat2x16(_v.r);f16vec2 _v1=unpackFloat2x16(_v.g);f16vec2 _v2=unpackFloat2x16(_v.b);f16vec2 _v3=unpackFloat2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"); custom_defines.append("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a);buffer_st1(buf,i4.r,_v0.r);buffer_st1(buf,i4.g,_v0.g);buffer_st1(buf,i4.b,_v1.r);buffer_st1(buf,i4.a,_v1.g);buffer_st1(buf,ii4.r,_v2.r);buffer_st1(buf,ii4.g,_v2.g);buffer_st1(buf,ii4.b,_v3.r);buffer_st1(buf,ii4.a,_v3.g);}"); custom_defines.append("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"); } else if (opt.use_fp16_storage) { custom_defines.append("buffer_ld1(buf,i)", "float(buf[i])"); custom_defines.append("buffer_st1(buf,i,v)", "{buf[i]=float16_t(v);}"); custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}"); custom_defines.append("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}"); custom_defines.append("buffer_ld2(buf,i)", "vec2(buf[i])"); custom_defines.append("buffer_st2(buf,i,v)", "{buf[i]=f16vec2(v);}"); custom_defines.append("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_ld4(buf,i)", "vec4(buf[i])"); custom_defines.append("buffer_st4(buf,i,v)", "{buf[i]=f16vec4(v);}"); custom_defines.append("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"); custom_defines.append("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}"); custom_defines.append("buffer_ld8(buf,i)", "mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))"); custom_defines.append("buffer_st8(buf,i,v)", "{buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}"); custom_defines.append("buffer_cp8(buf,i,sbuf,si)", "{buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}"); custom_defines.append("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}"); custom_defines.append("buffer_cp8to4(buf,i2,sbuf,si)", "{buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}"); } else if (opt.use_fp16_packed) { // custom_defines.append("buffer_ld1(buf,i)", "buf[i]"); custom_defines.append("buffer_ld1(buf,i)", "unpackHalf2x16(buf[(i)/2])[(i)%2]"); // custom_defines.append("buffer_st1(buf,i,v)", "{buf[i]=v;}"); custom_defines.append("buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id2=_i/2;uint _im2=_i%2;float _vs=float(v);uint _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id2],0,0);vec2 _v=unpackHalf2x16(_old_v);_v[_im2]=_vs;_new_v=packHalf2x16(_v);} while(atomicCompSwap(buf[_id2],_old_v,_new_v)!=_old_v);}"); // custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{uint _i=uint(i);uint _id2=_i/2;uint _im2=_i%2;uint _si=uint(si);uint _sid2=_si/2;uint _sim2=_si%2;float v=unpackHalf2x16(sbuf[_sid2])[_sim2];uint _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id2],0,0);vec2 _v=unpackHalf2x16(_old_v);_v[_im2]=v;_new_v=packHalf2x16(_v);} while(atomicCompSwap(buf[_id2],_old_v,_new_v)!=_old_v);}"); // custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}"); custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{uvec4 _si4d2=uvec4(si4)/2;uvec4 _si4m2=uvec4(si4)%2; buf[i]=uvec2(packHalf2x16(vec2(unpackHalf2x16(sbuf[_si4d2.r])[_si4m2.r],unpackHalf2x16(sbuf[_si4d2.g])[_si4m2.g])),packHalf2x16(vec2(unpackHalf2x16(sbuf[_si4d2.b])[_si4m2.b],unpackHalf2x16(sbuf[_si4d2.a])[_si4m2.a])));}"); // custom_defines.append("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}"); custom_defines.append("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{uvec4 _si4d2=uvec4(si4)/2;uvec4 _sii4d2=uvec4(sii4)/2;uvec4 _si4m2=uvec4(si4)%2;uvec4 _sii4m2=uvec4(sii4)%2; buf[i]=uvec4(packHalf2x16(vec2(unpackHalf2x16(sbuf[_si4d2.r])[_si4m2.r],unpackHalf2x16(sbuf[_si4d2.g])[_si4m2.g])),packHalf2x16(vec2(unpackHalf2x16(sbuf[_si4d2.b])[_si4m2.b],unpackHalf2x16(sbuf[_si4d2.a])[_si4m2.a])),packHalf2x16(vec2(unpackHalf2x16(sbuf[_sii4d2.r])[_sii4m2.r],unpackHalf2x16(sbuf[_sii4d2.g])[_sii4m2.g])),packHalf2x16(vec2(unpackHalf2x16(sbuf[_sii4d2.b])[_sii4m2.b],unpackHalf2x16(sbuf[_sii4d2.a])[_sii4m2.a])));}"); custom_defines.append("buffer_ld2(buf,i)", "unpackHalf2x16(buf[i])"); custom_defines.append("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(v)}"); custom_defines.append("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_ld4(buf,i)", "vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))"); custom_defines.append("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}"); custom_defines.append("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); // custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"); custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y);buffer_st1(buf,i4.r,_v0.r);buffer_st1(buf,i4.g,_v0.g);buffer_st1(buf,i4.b,_v1.r);buffer_st1(buf,i4.a,_v1.g);}"); custom_defines.append("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"); custom_defines.append("buffer_ld8(buf,i)", "mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))"); custom_defines.append("buffer_st8(buf,i,v)", "{buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}"); custom_defines.append("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); // custom_defines.append("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"); custom_defines.append("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a);buffer_st1(buf,i4.r,_v0.r);buffer_st1(buf,i4.g,_v0.g);buffer_st1(buf,i4.b,_v1.r);buffer_st1(buf,i4.a,_v1.g);buffer_st1(buf,ii4.r,_v2.r);buffer_st1(buf,ii4.g,_v2.g);buffer_st1(buf,ii4.b,_v3.r);buffer_st1(buf,ii4.a,_v3.g);}"); custom_defines.append("buffer_cp8to4(buf,i2,sbuf,si)", "{uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"); } else { custom_defines.append("buffer_ld1(buf,i)", "buf[i]"); custom_defines.append("buffer_st1(buf,i,v)", "{buf[i]=v;}"); custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"); custom_defines.append("buffer_cp1to8(buf,i,sbuf,si4,sii4)", "{buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"); custom_defines.append("buffer_ld2(buf,i)", "buf[i]"); custom_defines.append("buffer_st2(buf,i,v)", "{buf[i]=v;}"); custom_defines.append("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_ld4(buf,i)", "buf[i]"); custom_defines.append("buffer_st4(buf,i,v)", "{buf[i]=v;}"); custom_defines.append("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}"); custom_defines.append("buffer_cp4to8(buf,i,sbuf,si2)", "{buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}"); custom_defines.append("buffer_ld8(buf,i)", "buf[i]"); custom_defines.append("buffer_st8(buf,i,v)", "{buf[i]=v;}"); custom_defines.append("buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("buffer_cp8to1(buf,i4,ii4,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"); custom_defines.append("buffer_cp8to4(buf,i2,sbuf,si)", "{mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"); custom_defines.append("sfp2afpmat4(v)", "v"); custom_defines.append("afp2sfpmat4(v)", "v"); } if (opt.use_int8_storage) { custom_defines.append("sint8", "int8_t"); } else if (opt.use_int8_packed) { custom_defines.append("sint8", "int"); } else { custom_defines.append("sint8", "int"); } custom_defines.append("sint8vec4", "int"); custom_defines.append("sint8vec8", "ivec2"); custom_defines.append("aint8", "int"); custom_defines.append("aint8vec4", "ivec4"); custom_defines.append("unpackInt4x8(v)", "ivec4((v<<24)>>24,(v<<16)>>24,(v<<8)>>24,v>>24)"); custom_defines.append("packInt4x8(v)", "int((uint(v.r)&0xFFu)|((uint(v.g)&0xFFu)<<8)|((uint(v.b)&0xFFu)<<16)|((uint(v.a)&0xFFu)<<24))"); if (opt.use_int8_storage) { custom_defines.append("i8buffer_ld1(buf,i)", "int(buf[i])"); custom_defines.append("i8buffer_st1(buf,i,v)", "{buf[i]=int8_t(v);}"); custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); } else { custom_defines.append("i8buffer_ld1(buf,i)", "int(((buf[(i)/4])<<(24-((i)%4)*8))>>24)"); custom_defines.append("i8buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id4=_i/4;uint _im4=_i%4;int _vs=int(v);int _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id4],0,0);ivec4 _v=unpackInt4x8(_old_v);_v[_im4]=_vs;_new_v=packInt4x8(_v);} while(atomicCompSwap(buf[_id4],_old_v,_new_v)!=_old_v);}"); custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{int _v=i8buffer_ld1(sbuf,si);i8buffer_st1(buf,i,_v);}"); } custom_defines.append("i8buffer_ld4(buf,i)", "unpackInt4x8(buf[i])"); custom_defines.append("i8buffer_st4(buf,i,v)", "{buf[i]=packInt4x8(v);}"); custom_defines.append("i8buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("i8buffer_ld8(buf,i)", "ivec8(unpackInt4x8(buf[i].r),unpackInt4x8(buf[i].g))"); custom_defines.append("i8buffer_st8(buf,i,v)", "{buf[i]=ivec2(packInt4x8(v.abcd),packInt4x8(v.efgh));}"); custom_defines.append("i8buffer_cp8(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}"); custom_defines.append("psc(x)", "(x==0?p.x:x)"); if (opt.use_fp16_storage) { custom_defines.append("NCNN_fp16_storage", 1); } else if (opt.use_fp16_packed) { custom_defines.append("NCNN_fp16_packed", 1); } if (opt.use_fp16_uniform) { custom_defines.append("NCNN_fp16_uniform", 1); } if (opt.use_fp16_arithmetic) { custom_defines.append("NCNN_fp16_arithmetic", 1); } if (opt.use_int8_storage) { custom_defines.append("NCNN_int8_storage", 1); } else if (opt.use_int8_packed) { custom_defines.append("NCNN_int8_packed", 1); } if (opt.use_int8_uniform) { custom_defines.append("NCNN_int8_uniform", 1); } if (opt.use_int8_arithmetic) { custom_defines.append("NCNN_int8_arithmetic", 1); } if (opt.use_shader_local_memory) { custom_defines.append("NCNN_shader_local_memory", 1); } #if __APPLE__ custom_defines.append("NCNN_moltenvk", 1); #endif custom_defines.append("ncnn_glsl_version", 1); bool support_shader_int64 = false; // fill device macros { int device_index = opt.vulkan_device_index; if (device_index < 0 || device_index >= get_gpu_count()) device_index = get_default_gpu_index(); const GpuInfo& info = get_gpu_info(device_index); support_shader_int64 = info.physicalDevicefeatures().shaderInt64; // pull in device extensions { const std::vector& properties = info.deviceExtensionProperties(); for (size_t i = 0; i < properties.size(); i++) { const VkExtensionProperties& exp = properties[i]; device_defines.append(exp.extensionName, exp.specVersion); } } #define DD_APPEND_FEATURE(X) device_defines.append(#X, features.X ? 1 : 0); // pull in device features macros { const VkPhysicalDeviceFeatures& features = info.physicalDevicefeatures(); DD_APPEND_FEATURE(robustBufferAccess) DD_APPEND_FEATURE(fullDrawIndexUint32) DD_APPEND_FEATURE(imageCubeArray) DD_APPEND_FEATURE(independentBlend) DD_APPEND_FEATURE(geometryShader) DD_APPEND_FEATURE(tessellationShader) DD_APPEND_FEATURE(sampleRateShading) DD_APPEND_FEATURE(dualSrcBlend) DD_APPEND_FEATURE(logicOp) DD_APPEND_FEATURE(multiDrawIndirect) DD_APPEND_FEATURE(drawIndirectFirstInstance) DD_APPEND_FEATURE(depthClamp) DD_APPEND_FEATURE(depthBiasClamp) DD_APPEND_FEATURE(fillModeNonSolid) DD_APPEND_FEATURE(depthBounds) DD_APPEND_FEATURE(wideLines) DD_APPEND_FEATURE(largePoints) DD_APPEND_FEATURE(alphaToOne) DD_APPEND_FEATURE(multiViewport) DD_APPEND_FEATURE(samplerAnisotropy) DD_APPEND_FEATURE(textureCompressionETC2) DD_APPEND_FEATURE(textureCompressionASTC_LDR) DD_APPEND_FEATURE(textureCompressionBC) DD_APPEND_FEATURE(occlusionQueryPrecise) DD_APPEND_FEATURE(pipelineStatisticsQuery) DD_APPEND_FEATURE(vertexPipelineStoresAndAtomics) DD_APPEND_FEATURE(fragmentStoresAndAtomics) DD_APPEND_FEATURE(shaderTessellationAndGeometryPointSize) DD_APPEND_FEATURE(shaderImageGatherExtended) DD_APPEND_FEATURE(shaderStorageImageExtendedFormats) DD_APPEND_FEATURE(shaderStorageImageMultisample) DD_APPEND_FEATURE(shaderStorageImageReadWithoutFormat) DD_APPEND_FEATURE(shaderStorageImageWriteWithoutFormat) DD_APPEND_FEATURE(shaderUniformBufferArrayDynamicIndexing) DD_APPEND_FEATURE(shaderSampledImageArrayDynamicIndexing) DD_APPEND_FEATURE(shaderStorageBufferArrayDynamicIndexing) DD_APPEND_FEATURE(shaderStorageImageArrayDynamicIndexing) DD_APPEND_FEATURE(shaderClipDistance) DD_APPEND_FEATURE(shaderCullDistance) DD_APPEND_FEATURE(shaderFloat64) DD_APPEND_FEATURE(shaderInt64) DD_APPEND_FEATURE(shaderInt16) DD_APPEND_FEATURE(shaderResourceResidency) DD_APPEND_FEATURE(shaderResourceMinLod) DD_APPEND_FEATURE(sparseBinding) DD_APPEND_FEATURE(sparseResidencyBuffer) DD_APPEND_FEATURE(sparseResidencyImage2D) DD_APPEND_FEATURE(sparseResidencyImage3D) DD_APPEND_FEATURE(sparseResidency2Samples) DD_APPEND_FEATURE(sparseResidency4Samples) DD_APPEND_FEATURE(sparseResidency8Samples) DD_APPEND_FEATURE(sparseResidency16Samples) DD_APPEND_FEATURE(sparseResidencyAliased) DD_APPEND_FEATURE(variableMultisampleRate) DD_APPEND_FEATURE(inheritedQueries) } if (info.support_VK_KHR_8bit_storage()) { const VkPhysicalDevice8BitStorageFeaturesKHR& features = info.query8BitStorageFeatures(); DD_APPEND_FEATURE(storageBuffer8BitAccess) DD_APPEND_FEATURE(uniformAndStorageBuffer8BitAccess) DD_APPEND_FEATURE(storagePushConstant8) } if (info.support_VK_KHR_16bit_storage()) { const VkPhysicalDevice16BitStorageFeaturesKHR& features = info.query16BitStorageFeatures(); DD_APPEND_FEATURE(storageBuffer16BitAccess) DD_APPEND_FEATURE(uniformAndStorageBuffer16BitAccess) DD_APPEND_FEATURE(storagePushConstant16) DD_APPEND_FEATURE(storageInputOutput16) } if (info.support_VK_KHR_shader_float16_int8()) { const VkPhysicalDeviceFloat16Int8FeaturesKHR& features = info.queryFloat16Int8Features(); DD_APPEND_FEATURE(shaderFloat16) DD_APPEND_FEATURE(shaderInt8) } if (info.support_VK_KHR_sampler_ycbcr_conversion()) { const VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR& features = info.querySamplerYcbcrConversionFeatures(); DD_APPEND_FEATURE(samplerYcbcrConversion) } if (info.support_VK_KHR_cooperative_matrix()) { const VkPhysicalDeviceCooperativeMatrixFeaturesKHR& features = info.queryCooperativeMatrixFeatures(); DD_APPEND_FEATURE(cooperativeMatrix) DD_APPEND_FEATURE(cooperativeMatrixRobustBufferAccess) } else if (info.support_VK_NV_cooperative_matrix()) { const VkPhysicalDeviceCooperativeMatrixFeaturesNV& features = info.queryCooperativeMatrixFeaturesNV(); DD_APPEND_FEATURE(cooperativeMatrix) DD_APPEND_FEATURE(cooperativeMatrixRobustBufferAccess) } if (info.support_VK_NV_cooperative_matrix2()) { const VkPhysicalDeviceCooperativeMatrix2FeaturesNV& features = info.queryCooperativeMatrix2FeaturesNV(); DD_APPEND_FEATURE(cooperativeMatrixWorkgroupScope) DD_APPEND_FEATURE(cooperativeMatrixFlexibleDimensions) DD_APPEND_FEATURE(cooperativeMatrixReductions) DD_APPEND_FEATURE(cooperativeMatrixConversions) DD_APPEND_FEATURE(cooperativeMatrixPerElementOperations) DD_APPEND_FEATURE(cooperativeMatrixTensorAddressing) DD_APPEND_FEATURE(cooperativeMatrixBlockLoads) } if (info.support_VK_NV_cooperative_vector()) { const VkPhysicalDeviceCooperativeVectorFeaturesNV& features = info.queryCooperativeVectorFeaturesNV(); DD_APPEND_FEATURE(cooperativeVector) DD_APPEND_FEATURE(cooperativeVectorTraining) } if (info.support_VK_EXT_subgroup_size_control()) { const VkPhysicalDeviceSubgroupSizeControlFeaturesEXT& features = info.querySubgroupSizeControlFeatures(); DD_APPEND_FEATURE(subgroupSizeControl) DD_APPEND_FEATURE(computeFullSubgroups) } if (info.support_VK_KHR_shader_bfloat16()) { const VkPhysicalDeviceShaderBfloat16FeaturesKHR& features = info.queryShaderBfloat16Features(); DD_APPEND_FEATURE(shaderBFloat16Type) DD_APPEND_FEATURE(shaderBFloat16DotProduct) DD_APPEND_FEATURE(shaderBFloat16CooperativeMatrix) } if (info.support_VK_EXT_shader_float8()) { const VkPhysicalDeviceShaderFloat8FeaturesEXT& features = info.queryShaderFloat8Features(); DD_APPEND_FEATURE(shaderFloat8) DD_APPEND_FEATURE(shaderFloat8CooperativeMatrix) } if (info.support_VK_KHR_shader_float_controls2()) { const VkPhysicalDeviceShaderFloatControls2FeaturesKHR& features = info.queryShaderFloatControls2Features(); DD_APPEND_FEATURE(shaderFloatControls2) } if (info.support_VK_KHR_shader_integer_dot_product()) { const VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR& features = info.queryShaderIntegerDotProductFeatures(); DD_APPEND_FEATURE(shaderIntegerDotProduct) } if (info.support_VK_KHR_shader_subgroup_rotate()) { const VkPhysicalDeviceShaderSubgroupRotateFeaturesKHR& features = info.queryShaderSubgroupRotateFeatures(); DD_APPEND_FEATURE(shaderSubgroupRotate) DD_APPEND_FEATURE(shaderSubgroupRotateClustered) } if (info.support_VK_EXT_shader_atomic_float()) { const VkPhysicalDeviceShaderAtomicFloatFeaturesEXT& features = info.queryShaderAtomicFloatFeatures(); DD_APPEND_FEATURE(shaderBufferFloat32Atomics) DD_APPEND_FEATURE(shaderBufferFloat32AtomicAdd) DD_APPEND_FEATURE(shaderBufferFloat64Atomics) DD_APPEND_FEATURE(shaderBufferFloat64AtomicAdd) DD_APPEND_FEATURE(shaderSharedFloat32Atomics) DD_APPEND_FEATURE(shaderSharedFloat32AtomicAdd) DD_APPEND_FEATURE(shaderSharedFloat64Atomics) DD_APPEND_FEATURE(shaderSharedFloat64AtomicAdd) DD_APPEND_FEATURE(shaderImageFloat32Atomics) DD_APPEND_FEATURE(shaderImageFloat32AtomicAdd) DD_APPEND_FEATURE(sparseImageFloat32Atomics) DD_APPEND_FEATURE(sparseImageFloat32AtomicAdd) } if (info.support_VK_EXT_shader_atomic_float2()) { const VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT& features = info.queryShaderAtomicFloat2Features(); DD_APPEND_FEATURE(shaderBufferFloat16Atomics) DD_APPEND_FEATURE(shaderBufferFloat16AtomicAdd) DD_APPEND_FEATURE(shaderBufferFloat16AtomicMinMax) DD_APPEND_FEATURE(shaderBufferFloat32AtomicMinMax) DD_APPEND_FEATURE(shaderBufferFloat64AtomicMinMax) DD_APPEND_FEATURE(shaderSharedFloat16Atomics) DD_APPEND_FEATURE(shaderSharedFloat16AtomicAdd) DD_APPEND_FEATURE(shaderSharedFloat16AtomicMinMax) DD_APPEND_FEATURE(shaderSharedFloat32AtomicMinMax) DD_APPEND_FEATURE(shaderSharedFloat64AtomicMinMax) DD_APPEND_FEATURE(shaderImageFloat32AtomicMinMax) DD_APPEND_FEATURE(sparseImageFloat32AtomicMinMax) } if (info.support_VK_KHR_vulkan_memory_model()) { const VkPhysicalDeviceVulkanMemoryModelFeaturesKHR& features = info.queryVulkanMemoryModelFeatures(); DD_APPEND_FEATURE(vulkanMemoryModel) DD_APPEND_FEATURE(vulkanMemoryModelDeviceScope) DD_APPEND_FEATURE(vulkanMemoryModelAvailabilityVisibilityChains) } #undef DD_APPEND_FEATURE #define DD_APPEND_PROPERTY(X) device_defines.append(#X, properties.X); // pull in device properties macros { const VkPhysicalDeviceProperties& properties = info.physicalDeviceProperties(); DD_APPEND_PROPERTY(apiVersion) DD_APPEND_PROPERTY(driverVersion) DD_APPEND_PROPERTY(vendorID) DD_APPEND_PROPERTY(deviceID) DD_APPEND_PROPERTY(deviceType) // DD_APPEND_PROPERTY(deviceName) // DD_APPEND_PROPERTY(pipelineCacheUUID) #define DD_APPEND_PROPERTY_LIMIT(X) device_defines.append(#X, properties.limits.X); #define DD_APPEND_PROPERTY_LIMIT_2(X) \ device_defines.append(#X "_0", properties.limits.X[0]); \ device_defines.append(#X "_1", properties.limits.X[1]); #define DD_APPEND_PROPERTY_LIMIT_3(X) \ device_defines.append(#X "_0", properties.limits.X[0]); \ device_defines.append(#X "_1", properties.limits.X[1]); \ device_defines.append(#X "_2", properties.limits.X[2]); DD_APPEND_PROPERTY_LIMIT(maxImageDimension1D) DD_APPEND_PROPERTY_LIMIT(maxImageDimension2D) DD_APPEND_PROPERTY_LIMIT(maxImageDimension3D) DD_APPEND_PROPERTY_LIMIT(maxImageDimensionCube) DD_APPEND_PROPERTY_LIMIT(maxImageArrayLayers) DD_APPEND_PROPERTY_LIMIT(maxTexelBufferElements) DD_APPEND_PROPERTY_LIMIT(maxUniformBufferRange) DD_APPEND_PROPERTY_LIMIT(maxStorageBufferRange) DD_APPEND_PROPERTY_LIMIT(maxPushConstantsSize) DD_APPEND_PROPERTY_LIMIT(maxMemoryAllocationCount) DD_APPEND_PROPERTY_LIMIT(maxSamplerAllocationCount) DD_APPEND_PROPERTY_LIMIT(bufferImageGranularity) DD_APPEND_PROPERTY_LIMIT(sparseAddressSpaceSize) DD_APPEND_PROPERTY_LIMIT(maxBoundDescriptorSets) DD_APPEND_PROPERTY_LIMIT(maxPerStageDescriptorSamplers) DD_APPEND_PROPERTY_LIMIT(maxPerStageDescriptorUniformBuffers) DD_APPEND_PROPERTY_LIMIT(maxPerStageDescriptorStorageBuffers) DD_APPEND_PROPERTY_LIMIT(maxPerStageDescriptorSampledImages) DD_APPEND_PROPERTY_LIMIT(maxPerStageDescriptorStorageImages) DD_APPEND_PROPERTY_LIMIT(maxPerStageDescriptorInputAttachments) DD_APPEND_PROPERTY_LIMIT(maxPerStageResources) DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetSamplers) DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetUniformBuffers) DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetUniformBuffersDynamic) DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetStorageBuffers) DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetStorageBuffersDynamic) DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetSampledImages) DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetStorageImages) DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetInputAttachments) DD_APPEND_PROPERTY_LIMIT(maxVertexInputAttributes) DD_APPEND_PROPERTY_LIMIT(maxVertexInputBindings) DD_APPEND_PROPERTY_LIMIT(maxVertexInputAttributeOffset) DD_APPEND_PROPERTY_LIMIT(maxVertexInputBindingStride) DD_APPEND_PROPERTY_LIMIT(maxVertexOutputComponents) DD_APPEND_PROPERTY_LIMIT(maxTessellationGenerationLevel) DD_APPEND_PROPERTY_LIMIT(maxTessellationPatchSize) DD_APPEND_PROPERTY_LIMIT(maxTessellationControlPerVertexInputComponents) DD_APPEND_PROPERTY_LIMIT(maxTessellationControlPerVertexOutputComponents) DD_APPEND_PROPERTY_LIMIT(maxTessellationControlPerPatchOutputComponents) DD_APPEND_PROPERTY_LIMIT(maxTessellationControlTotalOutputComponents) DD_APPEND_PROPERTY_LIMIT(maxTessellationEvaluationInputComponents) DD_APPEND_PROPERTY_LIMIT(maxTessellationEvaluationOutputComponents) DD_APPEND_PROPERTY_LIMIT(maxGeometryShaderInvocations) DD_APPEND_PROPERTY_LIMIT(maxGeometryInputComponents) DD_APPEND_PROPERTY_LIMIT(maxGeometryOutputComponents) DD_APPEND_PROPERTY_LIMIT(maxGeometryOutputVertices) DD_APPEND_PROPERTY_LIMIT(maxGeometryTotalOutputComponents) DD_APPEND_PROPERTY_LIMIT(maxFragmentInputComponents) DD_APPEND_PROPERTY_LIMIT(maxFragmentOutputAttachments) DD_APPEND_PROPERTY_LIMIT(maxFragmentDualSrcAttachments) DD_APPEND_PROPERTY_LIMIT(maxFragmentCombinedOutputResources) DD_APPEND_PROPERTY_LIMIT(maxComputeSharedMemorySize) DD_APPEND_PROPERTY_LIMIT_3(maxComputeWorkGroupCount) DD_APPEND_PROPERTY_LIMIT(maxComputeWorkGroupInvocations) DD_APPEND_PROPERTY_LIMIT_3(maxComputeWorkGroupSize) DD_APPEND_PROPERTY_LIMIT(subPixelPrecisionBits) DD_APPEND_PROPERTY_LIMIT(subTexelPrecisionBits) DD_APPEND_PROPERTY_LIMIT(mipmapPrecisionBits) DD_APPEND_PROPERTY_LIMIT(maxDrawIndexedIndexValue) DD_APPEND_PROPERTY_LIMIT(maxDrawIndirectCount) DD_APPEND_PROPERTY_LIMIT(maxSamplerLodBias) DD_APPEND_PROPERTY_LIMIT(maxSamplerAnisotropy) DD_APPEND_PROPERTY_LIMIT(maxViewports) DD_APPEND_PROPERTY_LIMIT_2(maxViewportDimensions) DD_APPEND_PROPERTY_LIMIT_2(viewportBoundsRange) DD_APPEND_PROPERTY_LIMIT(viewportSubPixelBits) device_defines.append("minMemoryMapAlignment", (uint32_t)properties.limits.minMemoryMapAlignment); DD_APPEND_PROPERTY_LIMIT(minTexelBufferOffsetAlignment) DD_APPEND_PROPERTY_LIMIT(minUniformBufferOffsetAlignment) DD_APPEND_PROPERTY_LIMIT(minStorageBufferOffsetAlignment) DD_APPEND_PROPERTY_LIMIT(minTexelOffset) DD_APPEND_PROPERTY_LIMIT(maxTexelOffset) DD_APPEND_PROPERTY_LIMIT(minTexelGatherOffset) DD_APPEND_PROPERTY_LIMIT(maxTexelGatherOffset) DD_APPEND_PROPERTY_LIMIT(minInterpolationOffset) DD_APPEND_PROPERTY_LIMIT(maxInterpolationOffset) DD_APPEND_PROPERTY_LIMIT(subPixelInterpolationOffsetBits) DD_APPEND_PROPERTY_LIMIT(maxFramebufferWidth) DD_APPEND_PROPERTY_LIMIT(maxFramebufferHeight) DD_APPEND_PROPERTY_LIMIT(maxFramebufferLayers) DD_APPEND_PROPERTY_LIMIT(framebufferColorSampleCounts) DD_APPEND_PROPERTY_LIMIT(framebufferDepthSampleCounts) DD_APPEND_PROPERTY_LIMIT(framebufferStencilSampleCounts) DD_APPEND_PROPERTY_LIMIT(framebufferNoAttachmentsSampleCounts) DD_APPEND_PROPERTY_LIMIT(maxColorAttachments) DD_APPEND_PROPERTY_LIMIT(sampledImageColorSampleCounts) DD_APPEND_PROPERTY_LIMIT(sampledImageIntegerSampleCounts) DD_APPEND_PROPERTY_LIMIT(sampledImageDepthSampleCounts) DD_APPEND_PROPERTY_LIMIT(sampledImageStencilSampleCounts) DD_APPEND_PROPERTY_LIMIT(storageImageSampleCounts) DD_APPEND_PROPERTY_LIMIT(maxSampleMaskWords) DD_APPEND_PROPERTY_LIMIT(timestampComputeAndGraphics) DD_APPEND_PROPERTY_LIMIT(timestampPeriod) DD_APPEND_PROPERTY_LIMIT(maxClipDistances) DD_APPEND_PROPERTY_LIMIT(maxCullDistances) DD_APPEND_PROPERTY_LIMIT(maxCombinedClipAndCullDistances) DD_APPEND_PROPERTY_LIMIT(discreteQueuePriorities) DD_APPEND_PROPERTY_LIMIT_2(pointSizeRange) DD_APPEND_PROPERTY_LIMIT_2(lineWidthRange) DD_APPEND_PROPERTY_LIMIT(pointSizeGranularity) DD_APPEND_PROPERTY_LIMIT(lineWidthGranularity) DD_APPEND_PROPERTY_LIMIT(strictLines) DD_APPEND_PROPERTY_LIMIT(standardSampleLocations) DD_APPEND_PROPERTY_LIMIT(optimalBufferCopyOffsetAlignment) DD_APPEND_PROPERTY_LIMIT(optimalBufferCopyRowPitchAlignment) DD_APPEND_PROPERTY_LIMIT(nonCoherentAtomSize) #undef DD_APPEND_PROPERTY_LIMIT #undef DD_APPEND_PROPERTY_LIMIT_2 #undef DD_APPEND_PROPERTY_LIMIT_3 #define DD_APPEND_PROPERTY_SPARSE(X) device_defines.append(#X, properties.sparseProperties.X); DD_APPEND_PROPERTY_SPARSE(residencyStandard2DBlockShape) DD_APPEND_PROPERTY_SPARSE(residencyStandard2DMultisampleBlockShape) DD_APPEND_PROPERTY_SPARSE(residencyStandard3DBlockShape) DD_APPEND_PROPERTY_SPARSE(residencyAlignedMipSize) DD_APPEND_PROPERTY_SPARSE(residencyNonResidentStrict) #undef DD_APPEND_PROPERTY_SPARSE } { const VkPhysicalDeviceSubgroupProperties& properties = info.querySubgroupProperties(); DD_APPEND_PROPERTY(subgroupSize) DD_APPEND_PROPERTY(supportedStages) DD_APPEND_PROPERTY(supportedOperations) DD_APPEND_PROPERTY(quadOperationsInAllStages) // append subgroup ops device_defines.append("subgroup_basic", (properties.supportedOperations & VK_SUBGROUP_FEATURE_BASIC_BIT) ? 1 : 0); device_defines.append("subgroup_vote", (properties.supportedOperations & VK_SUBGROUP_FEATURE_VOTE_BIT) ? 1 : 0); device_defines.append("subgroup_arithmetic", (properties.supportedOperations & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) ? 1 : 0); device_defines.append("subgroup_ballot", (properties.supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT) ? 1 : 0); device_defines.append("subgroup_shuffle", (properties.supportedOperations & VK_SUBGROUP_FEATURE_SHUFFLE_BIT) ? 1 : 0); device_defines.append("subgroup_shuffle_relative", (properties.supportedOperations & VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) ? 1 : 0); device_defines.append("subgroup_clustered", (properties.supportedOperations & VK_SUBGROUP_FEATURE_CLUSTERED_BIT) ? 1 : 0); device_defines.append("subgroup_quad", (properties.supportedOperations & VK_SUBGROUP_FEATURE_QUAD_BIT) ? 1 : 0); device_defines.append("subgroup_rotate", (properties.supportedOperations & VK_SUBGROUP_FEATURE_ROTATE_BIT) ? 1 : 0); device_defines.append("subgroup_rotate_relative", (properties.supportedOperations & VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT) ? 1 : 0); device_defines.append("subgroup_partitioned", (properties.supportedOperations & VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV) ? 1 : 0); } if (info.support_VK_NV_cooperative_matrix2()) { const VkPhysicalDeviceCooperativeMatrix2PropertiesNV& properties = info.queryCooperativeMatrix2PropertiesNV(); DD_APPEND_PROPERTY(cooperativeMatrixWorkgroupScopeMaxWorkgroupSize) DD_APPEND_PROPERTY(cooperativeMatrixFlexibleDimensionsMaxDimension) DD_APPEND_PROPERTY(cooperativeMatrixWorkgroupScopeReservedSharedMemory) } if (info.support_VK_NV_cooperative_vector()) { const VkPhysicalDeviceCooperativeVectorPropertiesNV& properties = info.queryCooperativeVectorPropertiesNV(); DD_APPEND_PROPERTY(cooperativeVectorSupportedStages) DD_APPEND_PROPERTY(cooperativeVectorTrainingFloat16Accumulation) DD_APPEND_PROPERTY(cooperativeVectorTrainingFloat32Accumulation) DD_APPEND_PROPERTY(maxCooperativeVectorComponents) } if (info.support_VK_KHR_driver_properties()) { const VkPhysicalDeviceDriverPropertiesKHR& properties = info.queryDriverProperties(); DD_APPEND_PROPERTY(driverID) // DD_APPEND_PROPERTY(driverName) // DD_APPEND_PROPERTY(driverInfo) device_defines.append("conformanceVersion_major", properties.conformanceVersion.major); device_defines.append("conformanceVersion_minor", properties.conformanceVersion.minor); device_defines.append("conformanceVersion_subminor", properties.conformanceVersion.subminor); device_defines.append("conformanceVersion_patch", properties.conformanceVersion.patch); } if (info.support_VK_KHR_shader_integer_dot_product()) { const VkPhysicalDeviceShaderIntegerDotProductProperties& properties = info.queryShaderIntegerDotProductProperties(); DD_APPEND_PROPERTY(integerDotProduct8BitUnsignedAccelerated) DD_APPEND_PROPERTY(integerDotProduct8BitSignedAccelerated) DD_APPEND_PROPERTY(integerDotProduct8BitMixedSignednessAccelerated) DD_APPEND_PROPERTY(integerDotProduct4x8BitPackedUnsignedAccelerated) DD_APPEND_PROPERTY(integerDotProduct4x8BitPackedSignedAccelerated) DD_APPEND_PROPERTY(integerDotProduct4x8BitPackedMixedSignednessAccelerated) DD_APPEND_PROPERTY(integerDotProduct16BitUnsignedAccelerated) DD_APPEND_PROPERTY(integerDotProduct16BitSignedAccelerated) DD_APPEND_PROPERTY(integerDotProduct16BitMixedSignednessAccelerated) DD_APPEND_PROPERTY(integerDotProduct32BitUnsignedAccelerated) DD_APPEND_PROPERTY(integerDotProduct32BitSignedAccelerated) DD_APPEND_PROPERTY(integerDotProduct32BitMixedSignednessAccelerated) DD_APPEND_PROPERTY(integerDotProduct64BitUnsignedAccelerated) DD_APPEND_PROPERTY(integerDotProduct64BitSignedAccelerated) DD_APPEND_PROPERTY(integerDotProduct64BitMixedSignednessAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating8BitUnsignedAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating8BitSignedAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating16BitUnsignedAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating16BitSignedAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating32BitUnsignedAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating32BitSignedAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating64BitUnsignedAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating64BitSignedAccelerated) DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated) } if (info.support_VK_EXT_subgroup_size_control()) { const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& properties = info.querySubgroupSizeControlProperties(); DD_APPEND_PROPERTY(minSubgroupSize) DD_APPEND_PROPERTY(maxSubgroupSize) DD_APPEND_PROPERTY(maxComputeWorkgroupSubgroups) DD_APPEND_PROPERTY(requiredSubgroupSizeStages) } #if ENABLE_VALIDATION_LAYER if (info.support_VK_KHR_shader_non_semantic_info()) { device_defines.append("enable_validation_layer", VK_TRUE); custom_defines.append("NCNN_LOGE", "debugPrintfEXT"); } #endif #undef DD_APPEND_PROPERTY } std::string define_macro_data; for (size_t i = 0; i < custom_defines.definitions.size(); i++) { const char* key = custom_defines.definitions[i].first; const DefinitionCollector::typed_value& def = custom_defines.definitions[i].second; if (def.type == 0) { define_macro_data += std::string("#define ") + key + " " + def.s + "\n"; } else { char defstr[256]; if (def.type == 1) { sprintf(defstr, "%u", def.u8); } if (def.type == 2) { sprintf(defstr, "%u", def.u32); } if (def.type == 3) { sprintf(defstr, "%d", def.i32); } if (def.type == 4) { if (support_shader_int64) { sprintf(defstr, "%luull", def.u64); } else { uint32_t u32 = def.u64 > UINT_MAX ? UINT_MAX : (uint32_t)def.u64; sprintf(defstr, "%u", u32); } } if (def.type == 5) { sprintf(defstr, "%e", def.f32); } define_macro_data += std::string("#define ") + key + " " + defstr + "\n"; } } for (size_t i = 0; i < device_defines.definitions.size(); i++) { const char* key = device_defines.definitions[i].first; const DefinitionCollector::typed_value& def = device_defines.definitions[i].second; if (def.type == 0) { define_macro_data += std::string("#define ncnn_") + key + " \"" + def.s + "\"\n"; } else { char defstr[256]; if (def.type == 1) { sprintf(defstr, "%u", def.u8); } if (def.type == 2) { sprintf(defstr, "%u", def.u32); } if (def.type == 3) { sprintf(defstr, "%d", def.i32); } if (def.type == 4) { if (support_shader_int64) { sprintf(defstr, "%luull", def.u64); } else { uint32_t u32 = def.u64 > UINT_MAX ? UINT_MAX : (uint32_t)def.u64; sprintf(defstr, "%u", u32); } } if (def.type == 5) { sprintf(defstr, "%e", def.f32); } define_macro_data += std::string("#define ncnn_") + key + " " + defstr + "\n"; } } // enable extensions std::string custom_exts; if (support_shader_int64) { custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_int64: require\n"; } if (opt.use_fp16_storage) { custom_exts += "#extension GL_EXT_shader_16bit_storage: require\n"; custom_exts += "struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };\n"; } if (opt.use_fp16_arithmetic) { custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_float16: require\n"; } custom_exts += "struct ivec8 { ivec4 abcd; ivec4 efgh; };\n"; if (opt.use_int8_storage) { custom_exts += "#extension GL_EXT_shader_8bit_storage: require\n"; } if (opt.use_int8_arithmetic) { custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_int8: require\n"; } #if ENABLE_VALIDATION_LAYER { custom_exts += "#extension GL_EXT_debug_printf : require\n"; } #endif // debug // NCNN_LOGE("%s", define_macro_data.c_str()); bool compile_success = true; { glslang::TShader s(EShLangCompute); // split shader source by token "#version 450\n" int version_end_pos = -1; { for (int i = 0; i < comp_data_size - 8; i++) { if (strncmp(comp_data + i, "#version", 8) != 0) continue; // #version shall be the very beginning or after newline if (i != 0 && comp_data[i - 1] != '\n') continue; int nversion = 0; sscanf(comp_data + i, "#version %*d\n%n", &nversion); if (nversion == 0) continue; version_end_pos = i + nversion; break; } if (version_end_pos == -1) { NCNN_LOGE("shader source has no #version token"); return -1; } // NCNN_LOGE("version_end_pos = %d", version_end_pos); } const char* comp_data_2 = comp_data + version_end_pos; int comp_data_size_1 = version_end_pos; int comp_data_size_2 = comp_data_size - comp_data_size_1; const char* comp_datas[4] = {comp_data, custom_exts.c_str(), define_macro_data.c_str(), comp_data_2}; const int comp_data_sizes[4] = {comp_data_size_1, (int)custom_exts.size(), (int)define_macro_data.size(), comp_data_size_2}; s.setStringsWithLengths(comp_datas, comp_data_sizes, 4); s.setEntryPoint("main"); s.setSourceEntryPoint("main"); s.setEnvInput(glslang::EShSourceGlsl, EShLangCompute, glslang::EShClientVulkan, 1); if (opt.use_subgroup_ops || opt.use_cooperative_matrix) { // subgroup / cooperative_matrix need vulkan-1.1 and spirv-1.3 s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_1); s.setEnvTarget(glslang::EshTargetSpv, glslang::EShTargetSpv_1_3); } else { s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_0); s.setEnvTarget(glslang::EshTargetSpv, glslang::EShTargetSpv_1_0); } TBuiltInResource resources = get_default_TBuiltInResource(); VulkanShaderIncluder includer; bool pr = s.parse(&resources, 100, ENoProfile, false, false, EShMsgDefault, includer); if (!pr) { NCNN_LOGE("compile spir-v module failed"); NCNN_LOGE("%s", s.getInfoLog()); NCNN_LOGE("%s", s.getInfoDebugLog()); // print as line_number: code { const char* p = comp_datas[3]; const char* line_end; int line_number = 1; while ((line_end = strchr(p, '\n')) != NULL) { NCNN_LOGE("%d:\t%.*s", line_number++, (int)(line_end - p), p); p = line_end + 1; } if (*p != '\0') { NCNN_LOGE("%d:\t%s", line_number, p); } } compile_success = false; } else { glslang::TIntermediate* ir = s.getIntermediate(); glslang::GlslangToSpv(*ir, spirv); } } return compile_success ? 0 : -1; } int compile_spirv_module(int shader_type_index, const Option& opt, std::vector& spirv) { if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count) { NCNN_LOGE("no such shader module %d", shader_type_index); return -1; } const char* comp_data = layer_shader_registry[shader_type_index].comp_data; int comp_data_size = layer_shader_registry[shader_type_index].comp_data_size; return compile_spirv_module(comp_data, comp_data_size, opt, spirv); } int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info) { shader_info.specialization_count = 0; shader_info.binding_count = 0; shader_info.push_constant_count = 0; uint32_t parameter_id = -233; int specialization_count = 0; int binding_count = 0; int push_constant_count = 0; // id -> binding_type std::vector id_types; // binding_id -> binding_type std::vector binding_types; const uint32_t* p = spv_data; int bound = p[3]; id_types.resize(bound); // skip magic version generator bound schema p += 5; // foreach op while ((const unsigned char*)p < (const unsigned char*)spv_data + spv_data_size) { uint32_t opcode = p[0]; uint16_t wordcount = opcode >> 16; uint16_t op = opcode & 0xffff; if (op == 5) // OpName { uint32_t id = p[1]; const char* name = (const char*)&p[2]; if (strcmp(name, "parameter") == 0) { parameter_id = id; } } else if (op == 6) // OpMemberName { uint32_t id = p[1]; if (id == parameter_id) { push_constant_count++; } } else if (op == 25) // OpTypeImage { uint32_t id = p[1]; id_types[id] = 2; } else if (op == 27) // OpTypeSampledImage { uint32_t id = p[1]; id_types[id] = 3; } else if (op == 32) // OpTypePointer { uint32_t id = p[1]; uint32_t storage_class = p[2]; uint32_t type = p[3]; if (storage_class == 0) // UniformConstant { id_types[id] = id_types[type]; } if (storage_class == 2) // Uniform { id_types[id] = id_types[type]; } if (storage_class == 12) // StorageBuffer { id_types[type] = 1; id_types[id] = id_types[type]; } } else if (op == 59) // OpVariable { uint32_t id = p[1]; uint32_t var_id = p[2]; uint32_t storage_class = p[3]; if (storage_class == 0) // UniformConstant { id_types[var_id] = id_types[id]; } if (storage_class == 2) // Uniform { id_types[var_id] = id_types[id]; } if (storage_class == 12) // StorageBuffer { id_types[var_id] = id_types[id]; } } else if (op == 71) // OpDecorate { uint32_t id = p[1]; uint32_t decoration = p[2]; uint32_t binding_id = p[3]; if (decoration == 1) // SpecId { specialization_count++; } if (decoration == 3) // BufferBlock { id_types[id] = 1; } else if (decoration == 33) // Binding { binding_count = std::max(binding_count, (int)binding_id + 1); binding_types.resize(binding_count); binding_types[binding_id] = id; } } p += wordcount; } if (binding_count > 16) { NCNN_LOGE("too many binding %d", binding_count); return -1; } shader_info.specialization_count = specialization_count; shader_info.binding_count = binding_count; shader_info.push_constant_count = push_constant_count; // resolve binding_types for (int i = 0; i < binding_count; i++) { shader_info.binding_types[i] = id_types[binding_types[i]]; } return 0; } } // namespace ncnn #else namespace ncnn { int create_gpu_instance(const char* driver_path) { return 0; } void destroy_gpu_instance() { } int get_gpu_count() { return 0; } } // namespace ncnn #endif // NCNN_VULKAN