// Tencent is pleased to support the open source community by making ncnn available. // // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // // Unless required by applicable law or agreed to in writing, software distributed // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. #include "gpu.h" #if NCNN_VULKAN #include #include #include #include #include #include #include #include "mat.h" #if __ANDROID__ #define ENABLE_VALIDATION_LAYER 0 #else #define ENABLE_VALIDATION_LAYER 0 #endif namespace ncnn { // global static VkInstance g_instance = 0; static int g_gpu_count = 0; static int g_default_gpu_index = -1; // NOTE 8 is large enough i think ... #define NCNN_MAX_GPU_COUNT 8 static GpuInfo g_gpu_infos[NCNN_MAX_GPU_COUNT]; // default vulkan device static Mutex g_default_vkdev_lock; static VulkanDevice* g_default_vkdev[NCNN_MAX_GPU_COUNT] = {0}; // precompiled spirv struct layer_shader_registry_entry { const uint32_t* spv_data; size_t spv_data_size; }; #include "layer_shader_spv_data.h" static const layer_shader_registry_entry layer_shader_registry[] = { #include "layer_shader_registry.h" }; static ShaderInfo layer_shader_infos[sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry)]; static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry); int support_VK_KHR_external_memory_capabilities = 0; int support_VK_KHR_get_physical_device_properties2 = 0; int support_VK_KHR_get_surface_capabilities2 = 0; int support_VK_KHR_surface = 0; int support_VK_EXT_debug_utils = 0; #if __ANDROID_API__ >= 26 int support_VK_KHR_android_surface = 0; #endif // __ANDROID_API__ >= 26 // VK_KHR_external_memory_capabilities PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR = 0; // VK_KHR_get_physical_device_properties2 PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = 0; PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR = 0; PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR = 0; PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR = 0; PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR = 0; PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR = 0; PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR = 0; // VK_KHR_get_surface_capabilities2 PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR = 0; PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR = 0; // VK_KHR_surface PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR = 0; PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR = 0; PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR = 0; PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR = 0; PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR = 0; #if __ANDROID_API__ >= 26 // VK_KHR_android_surface PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR = 0; #endif // __ANDROID_API__ >= 26 // compile with old vulkan sdk #if VK_HEADER_VERSION < 80 #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000 typedef struct VkPhysicalDevice8BitStorageFeaturesKHR { VkStructureType sType; void* pNext; VkBool32 storageBuffer8BitAccess; VkBool32 uniformAndStorageBuffer8BitAccess; VkBool32 storagePushConstant8; } VkPhysicalDevice8BitStorageFeaturesKHR; #endif // VK_HEADER_VERSION < 80 #if VK_HEADER_VERSION < 95 #define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000 typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR { VkStructureType sType; void* pNext; VkBool32 shaderFloat16; VkBool32 shaderInt8; } VkPhysicalDeviceFloat16Int8FeaturesKHR; #endif // VK_HEADER_VERSION < 95 static int init_instance_extension() { if (support_VK_KHR_external_memory_capabilities) { vkGetPhysicalDeviceExternalBufferPropertiesKHR = (PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceExternalBufferPropertiesKHR"); } if (support_VK_KHR_get_physical_device_properties2) { vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR"); vkGetPhysicalDeviceProperties2KHR = (PFN_vkGetPhysicalDeviceProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceProperties2KHR"); vkGetPhysicalDeviceFormatProperties2KHR = (PFN_vkGetPhysicalDeviceFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFormatProperties2KHR"); vkGetPhysicalDeviceImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceImageFormatProperties2KHR"); vkGetPhysicalDeviceQueueFamilyProperties2KHR = (PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceQueueFamilyProperties2KHR"); vkGetPhysicalDeviceMemoryProperties2KHR = (PFN_vkGetPhysicalDeviceMemoryProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceMemoryProperties2KHR"); vkGetPhysicalDeviceSparseImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSparseImageFormatProperties2KHR"); } if (support_VK_KHR_get_surface_capabilities2) { vkGetPhysicalDeviceSurfaceCapabilities2KHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilities2KHR");; vkGetPhysicalDeviceSurfaceFormats2KHR = (PFN_vkGetPhysicalDeviceSurfaceFormats2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormats2KHR");; } if (support_VK_KHR_surface) { vkDestroySurfaceKHR = (PFN_vkDestroySurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkDestroySurfaceKHR");; vkGetPhysicalDeviceSurfaceSupportKHR = (PFN_vkGetPhysicalDeviceSurfaceSupportKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceSupportKHR");; vkGetPhysicalDeviceSurfaceCapabilitiesKHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilitiesKHR");; vkGetPhysicalDeviceSurfaceFormatsKHR = (PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormatsKHR");; vkGetPhysicalDeviceSurfacePresentModesKHR = (PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfacePresentModesKHR");; } #if __ANDROID_API__ >= 26 if (support_VK_KHR_android_surface) { vkCreateAndroidSurfaceKHR = (PFN_vkCreateAndroidSurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkCreateAndroidSurfaceKHR"); } #endif // __ANDROID_API__ >= 26 return 0; } #if ENABLE_VALIDATION_LAYER static VkDebugUtilsMessengerEXT callback; static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback( VkDebugUtilsMessageSeverityFlagBitsEXT /*messageSeverity*/, VkDebugUtilsMessageTypeFlagsEXT /*messageType*/, const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData, void* /*pUserData*/) { fprintf(stderr, "validation layer: %s\n", pCallbackData->pMessage); return VK_FALSE; } VkResult CreateDebugUtilsMessengerEXT(VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugUtilsMessengerEXT* pCallback) { PFN_vkCreateDebugUtilsMessengerEXT func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT"); if (func) return func(instance, pCreateInfo, pAllocator, pCallback); return VK_ERROR_EXTENSION_NOT_PRESENT; } void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT callback, const VkAllocationCallbacks* pAllocator) { PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT"); if (func) func(instance, callback, pAllocator); } #endif // ENABLE_VALIDATION_LAYER static uint32_t find_device_compute_queue(const std::vector& queueFamilyProperties) { // first try, compute only queue for (uint32_t i=0; i& queueFamilyProperties) { // first try, graphics only queue for (uint32_t i=0; i& queueFamilyProperties) { // first try, transfer only queue for (uint32_t i=0; i 0) return 0; fprintf(stderr, "no vulkan device\n"); return -1; } int create_gpu_instance() { VkResult ret; std::vector enabledLayers; #if ENABLE_VALIDATION_LAYER uint32_t instanceLayerPropertyCount; ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, NULL); if (ret != VK_SUCCESS) { fprintf(stderr, "vkEnumerateInstanceLayerProperties failed %d\n", ret); return -1; } std::vector instanceLayerProperties(instanceLayerPropertyCount); ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, instanceLayerProperties.data()); if (ret != VK_SUCCESS) { fprintf(stderr, "vkEnumerateInstanceLayerProperties failed %d\n", ret); return -1; } for (uint32_t i=0; i enabledExtensions; uint32_t instanceExtensionPropertyCount; ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, NULL); if (ret != VK_SUCCESS) { fprintf(stderr, "vkEnumerateInstanceExtensionProperties failed %d\n", ret); return -1; } std::vector instanceExtensionProperties(instanceExtensionPropertyCount); ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, instanceExtensionProperties.data()); if (ret != VK_SUCCESS) { fprintf(stderr, "vkEnumerateInstanceExtensionProperties failed %d\n", ret); return -1; } support_VK_KHR_get_physical_device_properties2 = 0; support_VK_KHR_get_surface_capabilities2 = 0; support_VK_KHR_surface = 0; support_VK_EXT_debug_utils = 0; #if __ANDROID_API__ >= 26 support_VK_KHR_android_surface = 0; #endif // __ANDROID_API__ >= 26 for (uint32_t j=0; j= 26 else if (strcmp(exp.extensionName, "VK_KHR_android_surface") == 0) support_VK_KHR_android_surface = exp.specVersion; #endif // __ANDROID_API__ >= 26 } if (support_VK_KHR_external_memory_capabilities) enabledExtensions.push_back("VK_KHR_external_memory_capabilities"); if (support_VK_KHR_get_physical_device_properties2) enabledExtensions.push_back("VK_KHR_get_physical_device_properties2"); if (support_VK_KHR_get_surface_capabilities2) enabledExtensions.push_back("VK_KHR_get_surface_capabilities2"); if (support_VK_KHR_surface) enabledExtensions.push_back("VK_KHR_surface"); #if ENABLE_VALIDATION_LAYER if (support_VK_EXT_debug_utils) enabledExtensions.push_back("VK_EXT_debug_utils"); #endif // ENABLE_VALIDATION_LAYER #if __ANDROID_API__ >= 26 if (support_VK_KHR_android_surface) enabledExtensions.push_back("VK_KHR_android_surface"); #endif // __ANDROID_API__ >= 26 VkApplicationInfo applicationInfo; applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; applicationInfo.pNext = 0; applicationInfo.pApplicationName = "ncnn"; applicationInfo.applicationVersion = 0; applicationInfo.pEngineName = "ncnn"; applicationInfo.engineVersion = 20200222; applicationInfo.apiVersion = VK_MAKE_VERSION(1, 0, 0); VkInstanceCreateInfo instanceCreateInfo; instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; instanceCreateInfo.pNext = 0; instanceCreateInfo.flags = 0; instanceCreateInfo.pApplicationInfo = &applicationInfo; instanceCreateInfo.enabledLayerCount = enabledLayers.size(); instanceCreateInfo.ppEnabledLayerNames = enabledLayers.data(); instanceCreateInfo.enabledExtensionCount = enabledExtensions.size(); instanceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data(); ret = vkCreateInstance(&instanceCreateInfo, 0, &g_instance); if (ret != VK_SUCCESS) { fprintf(stderr, "vkCreateInstance failed %d\n", ret); return -1; } #if ENABLE_VALIDATION_LAYER if (support_VK_EXT_debug_utils) { VkDebugUtilsMessengerCreateInfoEXT createInfo = {}; createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; createInfo.pfnUserCallback = debugCallback; createInfo.pUserData = 0; ret = CreateDebugUtilsMessengerEXT(g_instance, &createInfo, NULL, &callback); if (ret != VK_SUCCESS) { fprintf(stderr, "CreateDebugUtilsMessengerEXT failed %d\n", ret); return -1; } } #endif // ENABLE_VALIDATION_LAYER init_instance_extension(); uint32_t physicalDeviceCount = 0; ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, 0); if (ret != VK_SUCCESS) { fprintf(stderr, "vkEnumeratePhysicalDevices failed %d\n", ret); return -1; } if (physicalDeviceCount > NCNN_MAX_GPU_COUNT) physicalDeviceCount = NCNN_MAX_GPU_COUNT; std::vector physicalDevices(physicalDeviceCount); ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, physicalDevices.data()); if (ret != VK_SUCCESS) { fprintf(stderr, "vkEnumeratePhysicalDevices failed %d\n", ret); return -1; } // find proper device and queue int gpu_info_index = 0; for (uint32_t i=0; i queueFamilyProperties(queueFamilyPropertiesCount); vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties.data()); gpu_info.compute_queue_family_index = find_device_compute_queue(queueFamilyProperties); gpu_info.graphics_queue_family_index = find_device_graphics_queue(queueFamilyProperties); gpu_info.transfer_queue_family_index = find_device_transfer_queue(queueFamilyProperties); gpu_info.compute_queue_count = queueFamilyProperties[gpu_info.compute_queue_family_index].queueCount; gpu_info.graphics_queue_count = queueFamilyProperties[gpu_info.graphics_queue_family_index].queueCount; gpu_info.transfer_queue_count = queueFamilyProperties[gpu_info.transfer_queue_family_index].queueCount; // cache memory properties vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.physicalDeviceMemoryProperties); // get device extension uint32_t deviceExtensionPropertyCount = 0; ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, NULL); if (ret != VK_SUCCESS) { fprintf(stderr, "vkEnumerateDeviceExtensionProperties failed %d\n", ret); return -1; } std::vector deviceExtensionProperties(deviceExtensionPropertyCount); ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, deviceExtensionProperties.data()); if (ret != VK_SUCCESS) { fprintf(stderr, "vkEnumerateDeviceExtensionProperties failed %d\n", ret); return -1; } // extension capability gpu_info.support_VK_KHR_8bit_storage = 0; gpu_info.support_VK_KHR_16bit_storage = 0; gpu_info.support_VK_KHR_bind_memory2 = 0; gpu_info.support_VK_KHR_dedicated_allocation = 0; gpu_info.support_VK_KHR_descriptor_update_template = 0; gpu_info.support_VK_KHR_external_memory = 0; gpu_info.support_VK_KHR_get_memory_requirements2 = 0; gpu_info.support_VK_KHR_maintenance1 = 0; gpu_info.support_VK_KHR_push_descriptor = 0; gpu_info.support_VK_KHR_sampler_ycbcr_conversion = 0; gpu_info.support_VK_KHR_shader_float16_int8 = 0; gpu_info.support_VK_KHR_shader_float_controls = 0; gpu_info.support_VK_KHR_storage_buffer_storage_class = 0; gpu_info.support_VK_KHR_swapchain = 0; gpu_info.support_VK_EXT_queue_family_foreign = 0; #if __ANDROID_API__ >= 26 gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = 0; #endif // __ANDROID_API__ >= 26 for (uint32_t j=0; j= 26 else if (strcmp(exp.extensionName, "VK_ANDROID_external_memory_android_hardware_buffer") == 0) gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = exp.specVersion; #endif // __ANDROID_API__ >= 26 } // check features gpu_info.support_fp16_packed = true; gpu_info.support_fp16_storage = false; gpu_info.support_fp16_arithmetic = false; gpu_info.support_int8_storage = false; gpu_info.support_int8_arithmetic = false; gpu_info.support_ycbcr_conversion = false; if (support_VK_KHR_get_physical_device_properties2) { void* queryExtensionFeatures = 0; // query int8 storage VkPhysicalDevice8BitStorageFeaturesKHR query8BitStorageFeatures; query8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR; query8BitStorageFeatures.pNext = 0; if (gpu_info.support_VK_KHR_8bit_storage) { query8BitStorageFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &query8BitStorageFeatures; } // query fp16/int16 storage VkPhysicalDevice16BitStorageFeaturesKHR query16BitStorageFeatures; query16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR; query16BitStorageFeatures.pNext = 0; if (gpu_info.support_VK_KHR_16bit_storage) { query16BitStorageFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &query16BitStorageFeatures; } // query fp16/int8 arithmetic VkPhysicalDeviceFloat16Int8FeaturesKHR queryFloat16Int8Features; queryFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR; queryFloat16Int8Features.pNext = 0; if (gpu_info.support_VK_KHR_shader_float16_int8) { queryFloat16Int8Features.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryFloat16Int8Features; } // query ycbcr_conversion VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures; querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR; querySamplerYcbcrConversionFeatures.pNext = 0; if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion) { querySamplerYcbcrConversionFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &querySamplerYcbcrConversionFeatures; } VkPhysicalDeviceFeatures2KHR queryFeatures; queryFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR, queryFeatures.pNext = queryExtensionFeatures; vkGetPhysicalDeviceFeatures2KHR(physicalDevice, &queryFeatures); if (gpu_info.support_VK_KHR_8bit_storage) { gpu_info.support_int8_storage = query8BitStorageFeatures.storageBuffer8BitAccess && query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess; } if (gpu_info.support_VK_KHR_16bit_storage) { gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess && query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess; } if (gpu_info.support_VK_KHR_shader_float16_int8) { gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16; gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8; } if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion) { gpu_info.support_ycbcr_conversion = querySamplerYcbcrConversionFeatures.samplerYcbcrConversion; } } else { // // TODO // VkPhysicalDeviceFeatures features; // vkGetPhysicalDeviceFeatures(physicalDevice, &features); } if (physicalDeviceProperties.vendorID == 0x13b5) { // the 16bit_storage implementation of arm mali driver is buggy :[ gpu_info.support_fp16_storage = false; } if (physicalDeviceProperties.vendorID == 0x10002 && physicalDeviceProperties.deviceID == 0x70006214 && physicalDeviceProperties.apiVersion == VK_MAKE_VERSION(1, 1, 82)) { // the 16bit_storage implementation of vivante gc1700 driver is buggy :[ gpu_info.support_fp16_storage = false; } if (gpu_info.bug_implicit_fp16_arithmetic) { // force capability on as long as the driver accept spirv with fp16 arithmetic :D gpu_info.support_fp16_arithmetic = true; } fprintf(stderr, "[%u %s] queueC=%u[%u] queueG=%u[%u] queueT=%u[%u]\n", i, physicalDeviceProperties.deviceName, gpu_info.compute_queue_family_index, gpu_info.compute_queue_count, gpu_info.graphics_queue_family_index, gpu_info.graphics_queue_count, gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count); fprintf(stderr, "[%u %s] buglssc=%d bugihfa=%d\n", i, physicalDeviceProperties.deviceName, gpu_info.bug_local_size_spec_const, gpu_info.bug_implicit_fp16_arithmetic); fprintf(stderr, "[%u %s] fp16p=%d fp16s=%d fp16a=%d int8s=%d int8a=%d\n", i, physicalDeviceProperties.deviceName, gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic, gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic); gpu_info_index++; } g_gpu_count = gpu_info_index; // the default gpu device g_default_gpu_index = find_default_vulkan_device_index(); // resolve shader info for (int i=0; i enabledExtensions; if (info.support_VK_KHR_8bit_storage) enabledExtensions.push_back("VK_KHR_8bit_storage"); if (info.support_VK_KHR_16bit_storage) enabledExtensions.push_back("VK_KHR_16bit_storage"); if (info.support_VK_KHR_bind_memory2) enabledExtensions.push_back("VK_KHR_bind_memory2"); if (info.support_VK_KHR_dedicated_allocation) enabledExtensions.push_back("VK_KHR_dedicated_allocation"); if (info.support_VK_KHR_descriptor_update_template) enabledExtensions.push_back("VK_KHR_descriptor_update_template"); if (info.support_VK_KHR_external_memory) enabledExtensions.push_back("VK_KHR_external_memory"); if (info.support_VK_KHR_get_memory_requirements2) enabledExtensions.push_back("VK_KHR_get_memory_requirements2"); if (info.support_VK_KHR_maintenance1) enabledExtensions.push_back("VK_KHR_maintenance1"); if (info.support_VK_KHR_push_descriptor) enabledExtensions.push_back("VK_KHR_push_descriptor"); if (info.support_VK_KHR_sampler_ycbcr_conversion) enabledExtensions.push_back("VK_KHR_sampler_ycbcr_conversion"); if (info.support_VK_KHR_shader_float16_int8) enabledExtensions.push_back("VK_KHR_shader_float16_int8"); if (info.support_VK_KHR_shader_float_controls) enabledExtensions.push_back("VK_KHR_shader_float_controls"); if (info.support_VK_KHR_storage_buffer_storage_class) enabledExtensions.push_back("VK_KHR_storage_buffer_storage_class"); if (info.support_VK_KHR_swapchain) enabledExtensions.push_back("VK_KHR_swapchain"); if (info.support_VK_EXT_queue_family_foreign) enabledExtensions.push_back("VK_EXT_queue_family_foreign"); #if __ANDROID_API__ >= 26 if (info.support_VK_ANDROID_external_memory_android_hardware_buffer) enabledExtensions.push_back("VK_ANDROID_external_memory_android_hardware_buffer"); #endif // __ANDROID_API__ >= 26 void* enabledExtensionFeatures = 0; // enable int8 storage VkPhysicalDevice8BitStorageFeaturesKHR enabled8BitStorageFeatures; enabled8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR; enabled8BitStorageFeatures.pNext = 0; enabled8BitStorageFeatures.storageBuffer8BitAccess = info.support_int8_storage; enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = info.support_int8_storage; enabled8BitStorageFeatures.storagePushConstant8 = VK_FALSE; if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_8bit_storage) { enabled8BitStorageFeatures.pNext = enabledExtensionFeatures; enabledExtensionFeatures = &enabled8BitStorageFeatures; } // enable fp16/int16 storage VkPhysicalDevice16BitStorageFeaturesKHR enabled16BitStorageFeatures; enabled16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR; enabled16BitStorageFeatures.pNext = 0; enabled16BitStorageFeatures.storageBuffer16BitAccess = info.support_fp16_storage; enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = info.support_fp16_storage; enabled16BitStorageFeatures.storagePushConstant16 = VK_FALSE; enabled16BitStorageFeatures.storageInputOutput16 = VK_FALSE; if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_16bit_storage) { enabled16BitStorageFeatures.pNext = enabledExtensionFeatures; enabledExtensionFeatures = &enabled16BitStorageFeatures; } // enable fp16/int8 arithmetic VkPhysicalDeviceFloat16Int8FeaturesKHR enabledFloat16Int8Features; enabledFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR; enabledFloat16Int8Features.pNext = 0; enabledFloat16Int8Features.shaderFloat16 = info.support_fp16_arithmetic; enabledFloat16Int8Features.shaderInt8 = info.support_int8_arithmetic; if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_shader_float16_int8) { enabledFloat16Int8Features.pNext = enabledExtensionFeatures; enabledExtensionFeatures = &enabledFloat16Int8Features; } // enable ycbcr conversion VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures; querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR; querySamplerYcbcrConversionFeatures.pNext = 0; querySamplerYcbcrConversionFeatures.samplerYcbcrConversion = info.support_ycbcr_conversion; if (support_VK_KHR_get_physical_device_properties2 && info.support_ycbcr_conversion) { querySamplerYcbcrConversionFeatures.pNext = enabledExtensionFeatures; enabledExtensionFeatures = &querySamplerYcbcrConversionFeatures; } std::vector compute_queue_priorities(info.compute_queue_count, 1.f);// 0.f ~ 1.f std::vector graphics_queue_priorities(info.graphics_queue_count, 1.f);// 0.f ~ 1.f std::vector transfer_queue_priorities(info.transfer_queue_count, 1.f);// 0.f ~ 1.f VkDeviceQueueCreateInfo deviceQueueCreateInfos[3]; VkDeviceQueueCreateInfo deviceComputeQueueCreateInfo; deviceComputeQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; deviceComputeQueueCreateInfo.pNext = 0; deviceComputeQueueCreateInfo.flags = 0; deviceComputeQueueCreateInfo.queueFamilyIndex = info.compute_queue_family_index; deviceComputeQueueCreateInfo.queueCount = info.compute_queue_count; deviceComputeQueueCreateInfo.pQueuePriorities = compute_queue_priorities.data(); VkDeviceQueueCreateInfo deviceGraphicsQueueCreateInfo; deviceGraphicsQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; deviceGraphicsQueueCreateInfo.pNext = 0; deviceGraphicsQueueCreateInfo.flags = 0; deviceGraphicsQueueCreateInfo.queueFamilyIndex = info.graphics_queue_family_index; deviceGraphicsQueueCreateInfo.queueCount = info.graphics_queue_count; deviceGraphicsQueueCreateInfo.pQueuePriorities = graphics_queue_priorities.data(); VkDeviceQueueCreateInfo deviceTransferQueueCreateInfo; deviceTransferQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; deviceTransferQueueCreateInfo.pNext = 0; deviceTransferQueueCreateInfo.flags = 0; deviceTransferQueueCreateInfo.queueFamilyIndex = info.transfer_queue_family_index; deviceTransferQueueCreateInfo.queueCount = info.transfer_queue_count; deviceTransferQueueCreateInfo.pQueuePriorities = transfer_queue_priorities.data(); VkDeviceCreateInfo deviceCreateInfo; deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; deviceCreateInfo.pNext = enabledExtensionFeatures; deviceCreateInfo.flags = 0; if (info.compute_queue_family_index == info.graphics_queue_family_index && info.compute_queue_family_index == info.transfer_queue_family_index) { deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; deviceCreateInfo.queueCreateInfoCount = 1; } else if (info.compute_queue_family_index == info.graphics_queue_family_index && info.compute_queue_family_index != info.transfer_queue_family_index) { deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; deviceQueueCreateInfos[1] = deviceTransferQueueCreateInfo; deviceCreateInfo.queueCreateInfoCount = 2; } else if (info.compute_queue_family_index != info.graphics_queue_family_index && info.graphics_queue_family_index == info.transfer_queue_family_index) { deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; deviceQueueCreateInfos[1] = deviceGraphicsQueueCreateInfo; deviceCreateInfo.queueCreateInfoCount = 2; } else // if (info.compute_queue_family_index != info.graphics_queue_family_index && info.graphics_queue_family_index != info.transfer_queue_family_index) { deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; deviceQueueCreateInfos[1] = deviceGraphicsQueueCreateInfo; deviceQueueCreateInfos[2] = deviceTransferQueueCreateInfo; deviceCreateInfo.queueCreateInfoCount = 3; } deviceCreateInfo.pQueueCreateInfos = deviceQueueCreateInfos; deviceCreateInfo.enabledLayerCount = 0; deviceCreateInfo.ppEnabledLayerNames = 0; deviceCreateInfo.enabledExtensionCount = enabledExtensions.size(); deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data(); deviceCreateInfo.pEnabledFeatures = 0;// VkPhysicalDeviceFeatures pointer VkResult ret = vkCreateDevice(info.physical_device, &deviceCreateInfo, 0, &device); if (ret != VK_SUCCESS) { fprintf(stderr, "vkCreateDevice failed %d\n", ret); } init_device_extension(); create_shader_module(); compute_queues.resize(info.compute_queue_count); blob_allocators.resize(info.compute_queue_count); staging_allocators.resize(info.compute_queue_count); for (uint32_t i = 0; i < info.compute_queue_count; i++) { vkGetDeviceQueue(device, info.compute_queue_family_index, i, &compute_queues[i]); blob_allocators[i] = new VkBlobBufferAllocator(this); staging_allocators[i] = new VkStagingBufferAllocator(this); } if (info.compute_queue_family_index != info.graphics_queue_family_index) { graphics_queues.resize(info.graphics_queue_count); for (uint32_t i = 0; i < info.graphics_queue_count; i++) { vkGetDeviceQueue(device, info.graphics_queue_family_index, i, &graphics_queues[i]); } } if (info.compute_queue_family_index != info.transfer_queue_family_index && info.graphics_queue_family_index != info.transfer_queue_family_index) { transfer_queues.resize(info.transfer_queue_count); for (uint32_t i = 0; i < info.transfer_queue_count; i++) { vkGetDeviceQueue(device, info.transfer_queue_family_index, i, &transfer_queues[i]); } } } VulkanDevice::~VulkanDevice() { for (uint32_t i = 0; i < info.compute_queue_count; i++) { delete blob_allocators[i]; delete staging_allocators[i]; } blob_allocators.clear(); staging_allocators.clear(); destroy_shader_module(); vkDestroyDevice(device, 0); } VkShaderModule VulkanDevice::get_shader_module(int shader_type_index) const { if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count) { fprintf(stderr, "no such shader module %d\n", shader_type_index); return 0; } return shader_modules[shader_type_index]; } VkShaderModule VulkanDevice::create_shader_module(int shader_type_index, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const { if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count) { fprintf(stderr, "no such shader module %d\n", shader_type_index); return 0; } const uint32_t* spv_data = layer_shader_registry[shader_type_index].spv_data; size_t spv_data_size = layer_shader_registry[shader_type_index].spv_data_size; return compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z); } VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const { VkShaderModuleCreateInfo shaderModuleCreateInfo; shaderModuleCreateInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; shaderModuleCreateInfo.pNext = 0; shaderModuleCreateInfo.flags = 0; shaderModuleCreateInfo.codeSize = spv_data_size; shaderModuleCreateInfo.pCode = spv_data; VkShaderModule shader_module; VkResult ret = vkCreateShaderModule(device, &shaderModuleCreateInfo, 0, &shader_module); if (ret != VK_SUCCESS) { fprintf(stderr, "vkCreateShaderModule failed %d\n", ret); return 0; } return shader_module; } static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t* dstcode, size_t* dstsize) { uint32_t local_size_x_id = -1; uint32_t local_size_y_id = -1; uint32_t local_size_z_id = -1; uint32_t gl_WorkGroupSize_id = -1; const uint32_t* p = code; uint32_t* dp = dstcode; // skip magic version generator bound schema memcpy(dp, p, 5 * sizeof(uint32_t)); p += 5; dp += 5; // foreach op while ((const unsigned char*)p < (const unsigned char*)code + size) { uint32_t opcode = p[0]; uint16_t wordcount = opcode >> 16; uint16_t op = opcode & 0xffff; if (op == 16) // OpExecutionMode { uint32_t mode = p[2]; if (mode == 17) // LocalSize { memcpy(dp, p, wordcount * sizeof(uint32_t)); // set local_size_xyz dp[3] = local_size_x; dp[4] = local_size_y; dp[5] = local_size_z; p += wordcount; dp += wordcount; continue; } } else if (op == 50) // OpSpecConstant { uint32_t id = p[2]; if (id == local_size_x_id || id == local_size_y_id || id == local_size_z_id) { p += wordcount; continue; } } else if (op == 51) // OpSpecConstantComposite { uint32_t id = p[2]; if (id == gl_WorkGroupSize_id) { if (wordcount == 6 && (p[3] == local_size_x_id || p[4] == local_size_y_id || p[5] == local_size_z_id)) { p += wordcount; continue; } } } else if (op == 71) // OpDecorate { uint32_t id = p[1]; uint32_t decoration = p[2]; if (decoration == 1) // SpecId { uint32_t specid = p[3]; if (specid == 233) local_size_x_id = id; if (specid == 234) local_size_y_id = id; if (specid == 235) local_size_z_id = id; if (specid == 233 || specid == 234 || specid == 235) { p += wordcount; continue; } } else if (decoration == 11) // BuiltIn { uint32_t builtin = p[3]; if (builtin == 25) // WorkgroupSize { gl_WorkGroupSize_id = id; p += wordcount; continue; } } } memcpy(dp, p, wordcount * sizeof(uint32_t)); p += wordcount; dp += wordcount; } *dstsize = (unsigned char*)dp - (unsigned char*)dstcode; } VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const { uint32_t* spv_data_modified = (uint32_t*)malloc(spv_data_size); size_t spv_data_size_modified = spv_data_size; inject_local_size_xyz(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, spv_data_modified, &spv_data_size_modified); VkShaderModule shader_module = compile_shader_module(spv_data_modified, spv_data_size_modified); free(spv_data_modified); return shader_module; } uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const { // first try, find required and with preferred and without preferred_not for (uint32_t i=0; i& queues = queue_family_index == info.compute_queue_family_index ? compute_queues : queue_family_index == info.graphics_queue_family_index ? graphics_queues : transfer_queues; for (int i=0; i<(int)queues.size(); i++) { VkQueue queue = queues[i]; if (queue) { queues[i] = 0; return queue; } } // out of hardware queue return 0; } void VulkanDevice::reclaim_queue(uint32_t queue_family_index, VkQueue queue) const { if (queue_family_index != info.compute_queue_family_index && queue_family_index != info.graphics_queue_family_index && queue_family_index != info.transfer_queue_family_index) { fprintf(stderr, "invalid queue_family_index %u\n", queue_family_index); return; } MutexLockGuard lock(queue_lock); std::vector& queues = queue_family_index == info.compute_queue_family_index ? compute_queues : queue_family_index == info.graphics_queue_family_index ? graphics_queues : transfer_queues; for (int i=0; i<(int)queues.size(); i++) { if (!queues[i]) { queues[i] = queue; return; } } fprintf(stderr, "FATAL ERROR! reclaim_queue get wild queue %u %p\n", queue_family_index, queue); } VkAllocator* VulkanDevice::acquire_blob_allocator() const { MutexLockGuard lock(blob_allocator_lock); for (int i=0; i<(int)blob_allocators.size(); i++) { VkAllocator* allocator = blob_allocators[i]; if (allocator) { blob_allocators[i] = 0; return allocator; } } // out of blob allocator return 0; } void VulkanDevice::reclaim_blob_allocator(VkAllocator* allocator) const { MutexLockGuard lock(blob_allocator_lock); for (int i=0; i<(int)blob_allocators.size(); i++) { if (!blob_allocators[i]) { blob_allocators[i] = allocator; return; } } fprintf(stderr, "FATAL ERROR! reclaim_blob_allocator get wild allocator %p\n", allocator); } VkAllocator* VulkanDevice::acquire_staging_allocator() const { MutexLockGuard lock(staging_allocator_lock); for (int i=0; i<(int)staging_allocators.size(); i++) { VkAllocator* allocator = staging_allocators[i]; if (allocator) { staging_allocators[i] = 0; return allocator; } } // out of staging allocator return 0; } void VulkanDevice::reclaim_staging_allocator(VkAllocator* allocator) const { MutexLockGuard lock(staging_allocator_lock); for (int i=0; i<(int)staging_allocators.size(); i++) { if (!staging_allocators[i]) { staging_allocators[i] = allocator; return; } } fprintf(stderr, "FATAL ERROR! reclaim_staging_allocator get wild allocator %p\n", allocator); } static inline bool string_ends_with_fp16p(const char* name) { int len = strlen(name); if (len < 6) return false; return memcmp(name + len - 6, "_fp16p", 6) == 0; } static inline bool string_ends_with_fp16pa(const char* name) { int len = strlen(name); if (len < 7) return false; return memcmp(name + len - 7, "_fp16pa", 7) == 0; } static inline bool string_ends_with_fp16s(const char* name) { int len = strlen(name); if (len < 6) return false; return memcmp(name + len - 6, "_fp16s", 6) == 0; } static inline bool string_ends_with_fp16sa(const char* name) { int len = strlen(name); if (len < 7) return false; return memcmp(name + len - 7, "_fp16sa", 7) == 0; } int VulkanDevice::create_shader_module() { if (info.bug_local_size_spec_const) { // do not cache shader module return 0; } shader_modules.resize(layer_shader_registry_entry_count, VK_NULL_HANDLE); for (int i=0; i= 26 if (info.support_VK_ANDROID_external_memory_android_hardware_buffer) { vkGetAndroidHardwareBufferPropertiesANDROID = (PFN_vkGetAndroidHardwareBufferPropertiesANDROID)vkGetDeviceProcAddr(device, "vkGetAndroidHardwareBufferPropertiesANDROID"); vkGetMemoryAndroidHardwareBufferANDROID = (PFN_vkGetMemoryAndroidHardwareBufferANDROID)vkGetDeviceProcAddr(device, "vkGetMemoryAndroidHardwareBufferANDROID"); } #endif // __ANDROID_API__ >= 26 return 0; } VulkanDevice* get_gpu_device(int device_index) { if (device_index < 0 || device_index >= g_gpu_count) return 0; MutexLockGuard lock(g_default_vkdev_lock); if (!g_default_vkdev[device_index]) g_default_vkdev[device_index] = new VulkanDevice(device_index); return g_default_vkdev[device_index]; } const ShaderInfo& get_shader_info(int shader_type_index) { if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count) { fprintf(stderr, "no such shader module %d\n", shader_type_index); return layer_shader_infos[0]; } return layer_shader_infos[shader_type_index]; } ShaderInfo resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size) { uint32_t parameter_id = -233; int specialization_count = 0; int binding_count = 0; int push_constant_count = 0; const uint32_t* p = spv_data; // skip magic version generator bound schema p += 5; // foreach op while ((const unsigned char*)p < (const unsigned char*)spv_data + spv_data_size) { uint32_t opcode = p[0]; uint16_t wordcount = opcode >> 16; uint16_t op = opcode & 0xffff; if (op == 5) // OpName { uint32_t id = p[1]; const char* name = (const char*)&p[2]; if (strcmp(name, "parameter") == 0) { parameter_id = id; } } else if (op == 6) // OpMemberName { uint32_t id = p[1]; if (id == parameter_id) { push_constant_count++; } } else if (op == 71) // OpDecorate { uint32_t decoration = p[2]; if (decoration == 1) // SpecId { specialization_count++; } else if (decoration == 33) // Binding { binding_count++; } } p += wordcount; } ShaderInfo si; si.specialization_count = specialization_count; si.binding_count = binding_count; si.push_constant_count = push_constant_count; return si; } } // namespace ncnn #endif // NCNN_VULKAN