You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gpu.cpp 34 kB

[WIP] vulkan compute (#618) * vulkan infrastructure * vkallocator and vkmat * layer interface for vulkan compute * wip... * default vulkan device, command wrapper, upload model weight in load_model to simplify layer interface * simplify command api, vkmat holds staging buffer, relu works * initialize specialization constant, simplify command dispatch, fix staging buffer copy with different shape, convolution works * init extension functions * dynamic local size and group count * group count=1 is invalid * regard device max workgroup size limit * fix relu oooops * decouple command record and staging allocation * create result blob * add pooling shader * buffer is faster than image :) * fix pooling shader * add innerproduct shader * readonly writeonly decoration * simplify buffer creation * decouple command and layer, VK_KHR_descriptor_update_template extension makes descriptor binding update easy :D * fix vulkan building issues in visual studio (#1) * fix building issues on visual studio * ignore benchmark * cancel changes * ... ... * decouple paramdict and vulkandevice * fix staging buffer destroy in model loading * remove vkdev member in option * add padding shader * simplify vulkan layer creation, simplify convolution and pooling shader for no padding, less debug output * add convolutiondepthwise and softmax shader * specialization float type, add leakyrelu * add dropout shader * add batchnorm shader * split vulkan forward * add scale shader * push constant type can be int or float * set_optimal_local_size_xyz * add eltwise shader * concat vulkan forward * fix convolution without bias * add dummy shader for concat and split, more fix ... 
* optional VK_KHR_descriptor_update_template and VK_KHR_push_descriptor * check VK_KHR_push_descriptor for vkCmdPushDescriptorSetWithTemplateKHR * binaryop and unaryop shader * hide raw command buffer * simple vkbenchncnn benchmark * create device with transfer queue * rename command to vkcompute, add vktransfer and layer upload_model interface * external VkMat, copy and map wrt buffer offset * command copy respect offset and size * decouple weight upload and load, simplify upload weight api, use one big staging buffer for uploading weights * fix build on android * binding count can not vary :( * barrier check state, fix sub-op destruction * declare local_size_xyz constant, fix crash on radv * fix local_size_xyz, second try * more barrier and state fix * fix softmax * reconstruct buffer memory allocator, reuse blob buffer, less verbose output * find unified memory type index * weight staging buffer allocator and weight buffer allocator, respect descriptor buffer offset alignment * use VK_KHR_descriptor_update_template for faster descriptor update if available, multithread pipeline creation * find more useful vulkan extensions and enable them * fix msvc build * respect VK_KHR_dedicated_allocation for weight buffer allocation * fix android build * fix bias name conflicts with metal * decouple pipeline and layer, building shader sources into shader module, dedicated create_pipeline api, simplify pipeline recording * drop dummy shader, inplace softmax, multiple shader module works * fix unique queue family index error * flatten support vulkan * mnasnet run * find shader module by name, each entry point per shader module, fix attribute/id conflict on moltenvk * some minor changes * add some high level api * use dedicated transfer queue to upload weight model * prefer mappable buffer on unified memory * global pooling and convolution fc, reuse staging buffer * implement ring-buffer style blob allocator, add VkBufferMemory capacity * use blob allocator for workspace blob, 
it works fine :) * vulkan option off * Update layer.cpp * fix build with vulkan off * less verbose output, fix crash on vulkan_compute off * merge benchncnn tool * allocator clear api, use new weight buffer allocator per net * add default locked allocator * mapped mat ptr api, persistent mapped memory works generally :) * travis ci linux vulkan * travis ci vulkan wip ... * more gpu wip ... * more gpu wip ... * wip... * wip... * wip... ... * wip... ios vulkan build... * find glslangValidator on ios build * use dynamic moltenvk library * travis ci wip ... * ios simulator does not support metal at all * fix cpu only extractor * optimize workgroup size, first try * optimize workgroup size, second try * conv1x1s1d1 vec4 * revert build system * fix ncnn2mem build * fix ncnn2mem build
7 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "gpu.h"
  15. #if NCNN_VULKAN
  16. #include <vulkan/vulkan.h>
  17. #include <math.h>
  18. #include <stdio.h>
  19. #include <string.h>
  20. #include <algorithm>
  21. #include <vector>
  22. #include "mat.h"
  23. #if __ANDROID__
  24. #define ENABLE_VALIDATION_LAYER 0
  25. #else
  26. #define ENABLE_VALIDATION_LAYER 0
  27. #endif
  28. namespace ncnn {
  29. // global state shared by the functions in this file
  30. static VkInstance g_instance = 0; // Vulkan instance handle; written by vkCreateInstance in create_gpu_instance()
  31. static int g_gpu_count = 0; // number of physical devices recorded in g_gpu_infos
  32. static int g_default_gpu_index = -1; // result of find_default_vulkan_device_index(); -1 until a device is chosen
  33. // NOTE 8 is large enough i think ...
  34. static GpuInfo g_gpu_infos[8]; // per-device info table; create_gpu_instance() clamps the device count to 8
  35. #if ENABLE_VALIDATION_LAYER
  36. static VkDebugUtilsMessengerEXT callback; // debug messenger handle; created in create_gpu_instance(), destroyed in destroy_gpu_instance()
  37. static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(
  38. VkDebugUtilsMessageSeverityFlagBitsEXT /*messageSeverity*/,
  39. VkDebugUtilsMessageTypeFlagsEXT /*messageType*/,
  40. const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData,
  41. void* /*pUserData*/)
  42. {
  43. fprintf(stderr, "validation layer: %s\n", pCallbackData->pMessage);
  44. return VK_FALSE;
  45. }
  46. VkResult CreateDebugUtilsMessengerEXT(VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugUtilsMessengerEXT* pCallback)
  47. {
  48. PFN_vkCreateDebugUtilsMessengerEXT func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT");
  49. if (func)
  50. return func(instance, pCreateInfo, pAllocator, pCallback);
  51. return VK_ERROR_EXTENSION_NOT_PRESENT;
  52. }
  53. void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT callback, const VkAllocationCallbacks* pAllocator)
  54. {
  55. PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT");
  56. if (func)
  57. func(instance, callback, pAllocator);
  58. }
  59. #endif // ENABLE_VALIDATION_LAYER
  60. static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
  61. {
  62. // first try, compute only queue
  63. for (uint32_t i=0; i<queueFamilyProperties.size(); i++)
  64. {
  65. const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
  66. if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT) && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
  67. {
  68. return i;
  69. }
  70. }
  71. // second try, any queue with compute
  72. for (uint32_t i=0; i<queueFamilyProperties.size(); i++)
  73. {
  74. const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
  75. if (queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
  76. {
  77. return i;
  78. }
  79. }
  80. // fprintf(stderr, "no compute queue\n");
  81. return -1;
  82. }
  83. static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
  84. {
  85. // first try, transfer only queue
  86. for (uint32_t i=0; i<queueFamilyProperties.size(); i++)
  87. {
  88. const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
  89. if ((queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT) && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT) && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
  90. {
  91. return i;
  92. }
  93. }
  94. // second try, any queue with transfer
  95. for (uint32_t i=0; i<queueFamilyProperties.size(); i++)
  96. {
  97. const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
  98. if (queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
  99. {
  100. return i;
  101. }
  102. }
  103. // third try, use compute queue
  104. uint32_t compute_queue_index = find_device_compute_queue(queueFamilyProperties);
  105. if (compute_queue_index != (uint32_t)-1)
  106. {
  107. return compute_queue_index;
  108. }
  109. // fprintf(stderr, "no transfer queue\n");
  110. return -1;
  111. }
  112. static uint32_t find_unified_memory(VkPhysicalDeviceMemoryProperties physicalDeviceMemoryProperties)
  113. {
  114. // first try, host visible + host coherent + device local
  115. for (uint32_t i=0; i<physicalDeviceMemoryProperties.memoryTypeCount; i++)
  116. {
  117. const VkMemoryType& memoryType = physicalDeviceMemoryProperties.memoryTypes[i];
  118. if ((memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
  119. && (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
  120. && (memoryType.propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT))
  121. {
  122. return i;
  123. }
  124. }
  125. // second try, host visible + device local
  126. for (uint32_t i=0; i<physicalDeviceMemoryProperties.memoryTypeCount; i++)
  127. {
  128. const VkMemoryType& memoryType = physicalDeviceMemoryProperties.memoryTypes[i];
  129. if ((memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
  130. && (memoryType.propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT))
  131. {
  132. return i;
  133. }
  134. }
  135. // fprintf(stderr, "no unified memory\n");
  136. return -1;
  137. }
  138. static uint32_t find_device_local_memory(VkPhysicalDeviceMemoryProperties physicalDeviceMemoryProperties)
  139. {
  140. // first try, device local only
  141. for (uint32_t i=0; i<physicalDeviceMemoryProperties.memoryTypeCount; i++)
  142. {
  143. const VkMemoryType& memoryType = physicalDeviceMemoryProperties.memoryTypes[i];
  144. if (memoryType.propertyFlags == VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
  145. {
  146. return i;
  147. }
  148. }
  149. // second try, with device local bit
  150. for (uint32_t i=0; i<physicalDeviceMemoryProperties.memoryTypeCount; i++)
  151. {
  152. const VkMemoryType& memoryType = physicalDeviceMemoryProperties.memoryTypes[i];
  153. if (memoryType.propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
  154. {
  155. return i;
  156. }
  157. }
  158. // fprintf(stderr, "no device local memory\n");
  159. return -1;
  160. }
  161. static uint32_t find_host_visible_memory(VkPhysicalDeviceMemoryProperties physicalDeviceMemoryProperties)
  162. {
  163. // first try, host visible + host coherent, without device local bit
  164. for (uint32_t i=0; i<physicalDeviceMemoryProperties.memoryTypeCount; i++)
  165. {
  166. const VkMemoryType& memoryType = physicalDeviceMemoryProperties.memoryTypes[i];
  167. if ((memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
  168. && (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
  169. && !(memoryType.propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT))
  170. {
  171. return i;
  172. }
  173. }
  174. // second try, with host visible bit, without device local bit
  175. for (uint32_t i=0; i<physicalDeviceMemoryProperties.memoryTypeCount; i++)
  176. {
  177. const VkMemoryType& memoryType = physicalDeviceMemoryProperties.memoryTypes[i];
  178. if ((memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
  179. && !(memoryType.propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT))
  180. {
  181. return i;
  182. }
  183. }
  184. // third try, with host visible bit
  185. for (uint32_t i=0; i<physicalDeviceMemoryProperties.memoryTypeCount; i++)
  186. {
  187. const VkMemoryType& memoryType = physicalDeviceMemoryProperties.memoryTypes[i];
  188. if (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
  189. {
  190. return i;
  191. }
  192. }
  193. // fprintf(stderr, "no host visible memory\n");
  194. return -1;
  195. }
  196. static int find_default_vulkan_device_index()
  197. {
  198. // first try, discrete gpu
  199. for (int i=0; i<g_gpu_count; i++)
  200. {
  201. if (g_gpu_infos[i].type == 0)
  202. return i;
  203. }
  204. // second try, integrated gpu
  205. for (int i=0; i<g_gpu_count; i++)
  206. {
  207. if (g_gpu_infos[i].type == 1)
  208. return i;
  209. }
  210. // third try, any probed device
  211. if (g_gpu_count > 0)
  212. return 0;
  213. fprintf(stderr, "no vulkan device\n");
  214. return -1;
  215. }
  216. int create_gpu_instance()
  217. {
  218. VkResult ret;
  219. std::vector<const char*> enabledLayers;
  220. #if ENABLE_VALIDATION_LAYER
  221. uint32_t instanceLayerPropertyCount;
  222. ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, NULL);
  223. if (ret != VK_SUCCESS)
  224. {
  225. fprintf(stderr, "vkEnumerateInstanceLayerProperties failed %d\n", ret);
  226. return -1;
  227. }
  228. std::vector<VkLayerProperties> instanceLayerProperties(instanceLayerPropertyCount);
  229. ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, instanceLayerProperties.data());
  230. if (ret != VK_SUCCESS)
  231. {
  232. fprintf(stderr, "vkEnumerateInstanceLayerProperties failed %d\n", ret);
  233. return -1;
  234. }
  235. for (uint32_t i=0; i<instanceLayerPropertyCount; i++)
  236. {
  237. const VkLayerProperties& lp = instanceLayerProperties[i];
  238. // fprintf(stderr, "instance layer %s = %u\n", lp.layerName, lp.implementationVersion);
  239. if (strcmp(lp.layerName, "VK_LAYER_LUNARG_standard_validation") == 0)
  240. {
  241. enabledLayers.push_back("VK_LAYER_LUNARG_standard_validation");
  242. }
  243. if (strcmp(lp.layerName, "VK_LAYER_LUNARG_parameter_validation") == 0)
  244. {
  245. enabledLayers.push_back("VK_LAYER_LUNARG_parameter_validation");
  246. }
  247. }
  248. #endif // ENABLE_VALIDATION_LAYER
  249. std::vector<const char*> enabledExtensions;
  250. uint32_t instanceExtensionPropertyCount;
  251. ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, NULL);
  252. if (ret != VK_SUCCESS)
  253. {
  254. fprintf(stderr, "vkEnumerateInstanceExtensionProperties failed %d\n", ret);
  255. return -1;
  256. }
  257. std::vector<VkExtensionProperties> instanceExtensionProperties(instanceExtensionPropertyCount);
  258. ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, instanceExtensionProperties.data());
  259. if (ret != VK_SUCCESS)
  260. {
  261. fprintf(stderr, "vkEnumerateInstanceExtensionProperties failed %d\n", ret);
  262. return -1;
  263. }
  264. for (uint32_t j=0; j<instanceExtensionPropertyCount; j++)
  265. {
  266. const VkExtensionProperties& exp = instanceExtensionProperties[j];
  267. // fprintf(stderr, "instance extension %s = %u\n", exp.extensionName, exp.specVersion);
  268. if (strcmp(exp.extensionName, "VK_KHR_get_physical_device_properties2") == 0)
  269. {
  270. enabledExtensions.push_back("VK_KHR_get_physical_device_properties2");
  271. }
  272. #if ENABLE_VALIDATION_LAYER
  273. if (strcmp(exp.extensionName, "VK_EXT_debug_utils") == 0)
  274. {
  275. enabledExtensions.push_back("VK_EXT_debug_utils");
  276. }
  277. #endif // ENABLE_VALIDATION_LAYER
  278. }
  279. VkApplicationInfo applicationInfo;
  280. applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
  281. applicationInfo.pNext = 0;
  282. applicationInfo.pApplicationName = "ncnn";
  283. applicationInfo.applicationVersion = 0;
  284. applicationInfo.pEngineName = "ncnn";
  285. applicationInfo.engineVersion = 20181026;
  286. applicationInfo.apiVersion = VK_MAKE_VERSION(1, 0, 0);
  287. VkInstanceCreateInfo instanceCreateInfo;
  288. instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
  289. instanceCreateInfo.pNext = 0;
  290. instanceCreateInfo.flags = 0;
  291. instanceCreateInfo.pApplicationInfo = &applicationInfo;
  292. instanceCreateInfo.enabledLayerCount = enabledLayers.size();
  293. instanceCreateInfo.ppEnabledLayerNames = enabledLayers.data();
  294. instanceCreateInfo.enabledExtensionCount = enabledExtensions.size();
  295. instanceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();
  296. ret = vkCreateInstance(&instanceCreateInfo, 0, &g_instance);
  297. if (ret != VK_SUCCESS)
  298. {
  299. fprintf(stderr, "vkCreateInstance failed %d\n", ret);
  300. return -1;
  301. }
  302. #if ENABLE_VALIDATION_LAYER
  303. VkDebugUtilsMessengerCreateInfoEXT createInfo = {};
  304. createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
  305. createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT;
  306. createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT;
  307. createInfo.pfnUserCallback = debugCallback;
  308. createInfo.pUserData = 0;
  309. ret = CreateDebugUtilsMessengerEXT(g_instance, &createInfo, nullptr, &callback);
  310. if (ret != VK_SUCCESS)
  311. {
  312. fprintf(stderr, "CreateDebugUtilsMessengerEXT failed %d\n", ret);
  313. return -1;
  314. }
  315. #endif // ENABLE_VALIDATION_LAYER
  316. uint32_t physicalDeviceCount = 0;
  317. ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, 0);
  318. if (ret != VK_SUCCESS)
  319. {
  320. fprintf(stderr, "vkEnumeratePhysicalDevices failed %d\n", ret);
  321. return -1;
  322. }
  323. // NOTE 8 is large enough i think ...
  324. if (physicalDeviceCount > 8)
  325. physicalDeviceCount = 8;
  326. std::vector<VkPhysicalDevice> physicalDevices(physicalDeviceCount);
  327. ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, physicalDevices.data());
  328. if (ret != VK_SUCCESS)
  329. {
  330. fprintf(stderr, "vkEnumeratePhysicalDevices failed %d\n", ret);
  331. return -1;
  332. }
  333. g_gpu_count = physicalDeviceCount;
  334. // find proper device and queue
  335. for (uint32_t i=0; i<physicalDeviceCount; i++)
  336. {
  337. const VkPhysicalDevice& physicalDevice = physicalDevices[i];
  338. GpuInfo& gpu_info = g_gpu_infos[i];
  339. gpu_info.physical_device = physicalDevice;
  340. // device type
  341. VkPhysicalDeviceProperties physicalDeviceProperties;
  342. vkGetPhysicalDeviceProperties(physicalDevice, &physicalDeviceProperties);
  343. // fprintf(stderr, "[%u] apiVersion = %u.%u.%u\n", i, VK_VERSION_MAJOR(physicalDeviceProperties.apiVersion),
  344. // VK_VERSION_MINOR(physicalDeviceProperties.apiVersion), VK_VERSION_PATCH(physicalDeviceProperties.apiVersion));
  345. // fprintf(stderr, "[%u] driverVersion = %u.%u.%u\n", i, VK_VERSION_MAJOR(physicalDeviceProperties.driverVersion),
  346. // VK_VERSION_MINOR(physicalDeviceProperties.driverVersion), VK_VERSION_PATCH(physicalDeviceProperties.driverVersion));
  347. // fprintf(stderr, "[%u] vendorID = %x\n", i, physicalDeviceProperties.vendorID);
  348. // fprintf(stderr, "[%u] deviceID = %x\n", i, physicalDeviceProperties.deviceID);
  349. // fprintf(stderr, "[%u] deviceType = %x\n", i, physicalDeviceProperties.deviceType);
  350. // fprintf(stderr, "[%u] deviceName = %s\n", i, physicalDeviceProperties.deviceName);
  351. // fprintf(stderr, "[%u] pipelineCacheUUID = %u\n", i, physicalDeviceProperties.pipelineCacheUUID);
  352. if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU)
  353. gpu_info.type = 0;
  354. else if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU)
  355. gpu_info.type = 1;
  356. else if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU)
  357. gpu_info.type = 2;
  358. else if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_CPU)
  359. gpu_info.type = 3;
  360. else
  361. gpu_info.type = -1;
  362. // device capability
  363. gpu_info.max_shared_memory_size = physicalDeviceProperties.limits.maxComputeSharedMemorySize;
  364. gpu_info.max_workgroup_count[0] = physicalDeviceProperties.limits.maxComputeWorkGroupCount[0];
  365. gpu_info.max_workgroup_count[1] = physicalDeviceProperties.limits.maxComputeWorkGroupCount[1];
  366. gpu_info.max_workgroup_count[2] = physicalDeviceProperties.limits.maxComputeWorkGroupCount[2];
  367. gpu_info.max_workgroup_invocations = physicalDeviceProperties.limits.maxComputeWorkGroupInvocations;
  368. gpu_info.max_workgroup_size[0] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[0];
  369. gpu_info.max_workgroup_size[1] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[1];
  370. gpu_info.max_workgroup_size[2] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[2];
  371. gpu_info.memory_map_alignment = physicalDeviceProperties.limits.minMemoryMapAlignment;
  372. gpu_info.buffer_offset_alignment = physicalDeviceProperties.limits.minStorageBufferOffsetAlignment;
  373. // fprintf(stderr, "[%u] max_shared_memory_size = %d\n", i, gpu_info.max_shared_memory_size);
  374. // fprintf(stderr, "[%u] max_workgroup_count = %d %d %d\n", i, gpu_info.max_workgroup_count[0], gpu_info.max_workgroup_count[1], gpu_info.max_workgroup_count[2]);
  375. // fprintf(stderr, "[%u] max_workgroup_invocations = %d\n", i, gpu_info.max_workgroup_invocations);
  376. // fprintf(stderr, "[%u] max_workgroup_size = %d %d %d\n", i, gpu_info.max_workgroup_size[0], gpu_info.max_workgroup_size[1], gpu_info.max_workgroup_size[2]);
  377. // fprintf(stderr, "[%u] memory_map_alignment = %lu\n", i, gpu_info.memory_map_alignment);
  378. // fprintf(stderr, "[%u] buffer_offset_alignment = %lu\n", i, gpu_info.buffer_offset_alignment);
  379. // // TODO check features
  380. // VkPhysicalDeviceFeatures features;
  381. // vkGetPhysicalDeviceFeatures(physicalDevice, &features);
  382. //
  383. // // TODO check formatProperties
  384. // VkFormat format = VK_FORMAT_R32_SFLOAT;
  385. // VkFormatProperties formatProperties;
  386. // vkGetPhysicalDeviceFormatProperties(physicalDevice, format, &formatProperties);
  387. // find compute queue
  388. uint32_t queueFamilyPropertiesCount;
  389. vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, 0);
  390. std::vector<VkQueueFamilyProperties> queueFamilyProperties(queueFamilyPropertiesCount);
  391. vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties.data());
  392. gpu_info.compute_queue_index = find_device_compute_queue(queueFamilyProperties);
  393. gpu_info.transfer_queue_index = find_device_transfer_queue(queueFamilyProperties);
  394. // find memory type index
  395. VkPhysicalDeviceMemoryProperties physicalDeviceMemoryProperties;
  396. vkGetPhysicalDeviceMemoryProperties(physicalDevice, &physicalDeviceMemoryProperties);
  397. // // print memory info
  398. // for (uint32_t j=0; j<physicalDeviceMemoryProperties.memoryTypeCount; j++)
  399. // {
  400. // const VkMemoryType& memoryType = physicalDeviceMemoryProperties.memoryTypes[j];
  401. // fprintf(stderr, "[%u] memoryType %u heapIndex/propertyFlags = %d %u\n", i, j, memoryType.heapIndex, memoryType.propertyFlags);
  402. // }
  403. // for (uint32_t j=0; j<physicalDeviceMemoryProperties.memoryHeapCount; j++)
  404. // {
  405. // const VkMemoryHeap& memoryHeap = physicalDeviceMemoryProperties.memoryHeaps[j];
  406. // fprintf(stderr, "[%u] memoryHeap %u size/flags = %lu %u\n", i, j, memoryHeap.size, memoryHeap.flags);
  407. // }
  408. gpu_info.unified_memory_index = find_unified_memory(physicalDeviceMemoryProperties);
  409. gpu_info.device_local_memory_index = find_device_local_memory(physicalDeviceMemoryProperties);
  410. gpu_info.host_visible_memory_index = find_host_visible_memory(physicalDeviceMemoryProperties);
  411. // get device extension
  412. uint32_t deviceExtensionPropertyCount = 0;
  413. ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, NULL);
  414. if (ret != VK_SUCCESS)
  415. {
  416. fprintf(stderr, "vkEnumerateDeviceExtensionProperties failed %d\n", ret);
  417. return -1;
  418. }
  419. std::vector<VkExtensionProperties> deviceExtensionProperties(deviceExtensionPropertyCount);
  420. ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, deviceExtensionProperties.data());
  421. if (ret != VK_SUCCESS)
  422. {
  423. fprintf(stderr, "vkEnumerateDeviceExtensionProperties failed %d\n", ret);
  424. return -1;
  425. }
  426. // extension capability
  427. gpu_info.support_VK_KHR_8bit_storage = 0;
  428. gpu_info.support_VK_KHR_16bit_storage = 0;
  429. gpu_info.support_VK_KHR_bind_memory2 = 0;
  430. gpu_info.support_VK_KHR_dedicated_allocation = 0;
  431. gpu_info.support_VK_KHR_descriptor_update_template = 0;
  432. gpu_info.support_VK_KHR_get_memory_requirements2 = 0;
  433. gpu_info.support_VK_KHR_get_physical_device_properties2 = 0;
  434. gpu_info.support_VK_KHR_push_descriptor = 0;
  435. gpu_info.support_VK_KHR_shader_float16_int8 = 0;
  436. gpu_info.support_VK_KHR_shader_float_controls = 0;
  437. gpu_info.support_VK_KHR_storage_buffer_storage_class = 0;
  438. for (uint32_t j=0; j<deviceExtensionPropertyCount; j++)
  439. {
  440. const VkExtensionProperties& exp = deviceExtensionProperties[j];
  441. // fprintf(stderr, "device extension %s = %u\n", exp.extensionName, exp.specVersion);
  442. if (strcmp(exp.extensionName, "VK_KHR_8bit_storage") == 0)
  443. gpu_info.support_VK_KHR_8bit_storage = exp.specVersion;
  444. else if (strcmp(exp.extensionName, "VK_KHR_16bit_storage") == 0)
  445. gpu_info.support_VK_KHR_16bit_storage = exp.specVersion;
  446. else if (strcmp(exp.extensionName, "VK_KHR_bind_memory2") == 0)
  447. gpu_info.support_VK_KHR_bind_memory2 = exp.specVersion;
  448. else if (strcmp(exp.extensionName, "VK_KHR_dedicated_allocation") == 0)
  449. gpu_info.support_VK_KHR_dedicated_allocation = exp.specVersion;
  450. else if (strcmp(exp.extensionName, "VK_KHR_descriptor_update_template") == 0)
  451. gpu_info.support_VK_KHR_descriptor_update_template = exp.specVersion;
  452. else if (strcmp(exp.extensionName, "VK_KHR_get_memory_requirements2") == 0)
  453. gpu_info.support_VK_KHR_get_memory_requirements2 = exp.specVersion;
  454. else if (strcmp(exp.extensionName, "VK_KHR_get_physical_device_properties2") == 0)
  455. gpu_info.support_VK_KHR_get_physical_device_properties2 = exp.specVersion;
  456. else if (strcmp(exp.extensionName, "VK_KHR_push_descriptor") == 0)
  457. gpu_info.support_VK_KHR_push_descriptor = exp.specVersion;
  458. else if (strcmp(exp.extensionName, "VK_KHR_shader_float16_int8") == 0)
  459. gpu_info.support_VK_KHR_shader_float16_int8 = exp.specVersion;
  460. else if (strcmp(exp.extensionName, "VK_KHR_shader_float_controls") == 0)
  461. gpu_info.support_VK_KHR_shader_float_controls = exp.specVersion;
  462. else if (strcmp(exp.extensionName, "VK_KHR_storage_buffer_storage_class") == 0)
  463. gpu_info.support_VK_KHR_storage_buffer_storage_class = exp.specVersion;
  464. }
  465. // fprintf(stderr, "[%u] VK_KHR_8bit_storage = %d\n", i, gpu_info.support_VK_KHR_8bit_storage);
  466. // fprintf(stderr, "[%u] VK_KHR_16bit_storage = %d\n", i, gpu_info.support_VK_KHR_16bit_storage);
  467. // fprintf(stderr, "[%u] VK_KHR_bind_memory2 = %d\n", i, gpu_info.support_VK_KHR_bind_memory2);
  468. // fprintf(stderr, "[%u] VK_KHR_dedicated_allocation = %d\n", i, gpu_info.support_VK_KHR_dedicated_allocation);
  469. // fprintf(stderr, "[%u] VK_KHR_descriptor_update_template = %d\n", i, gpu_info.support_VK_KHR_descriptor_update_template);
  470. // fprintf(stderr, "[%u] VK_KHR_get_memory_requirements2 = %d\n", i, gpu_info.support_VK_KHR_get_memory_requirements2);
  471. // fprintf(stderr, "[%u] VK_KHR_get_physical_device_properties2 = %d\n", i, gpu_info.support_VK_KHR_get_physical_device_properties2);
  472. // fprintf(stderr, "[%u] VK_KHR_push_descriptor = %d\n", i, gpu_info.support_VK_KHR_push_descriptor);
  473. // fprintf(stderr, "[%u] VK_KHR_shader_float16_int8 = %d\n", i, gpu_info.support_VK_KHR_shader_float16_int8);
  474. // fprintf(stderr, "[%u] VK_KHR_shader_float_controls = %d\n", i, gpu_info.support_VK_KHR_shader_float_controls);
  475. // fprintf(stderr, "[%u] VK_KHR_storage_buffer_storage_class = %d\n", i, gpu_info.support_VK_KHR_storage_buffer_storage_class);
  476. fprintf(stderr, "[%u %s] queueC=%u queueT=%u memU=%u memDL=%u memHV=%u\n", i, physicalDeviceProperties.deviceName,
  477. gpu_info.compute_queue_index, gpu_info.transfer_queue_index,
  478. gpu_info.unified_memory_index, gpu_info.device_local_memory_index, gpu_info.host_visible_memory_index);
  479. }
  480. // the default gpu device
  481. g_default_gpu_index = find_default_vulkan_device_index();
  482. return 0;
  483. }
  484. void destroy_gpu_instance()
  485. {
  486. #if ENABLE_VALIDATION_LAYER
  487. DestroyDebugUtilsMessengerEXT(g_instance, callback, NULL);
  488. #endif // ENABLE_VALIDATION_LAYER
  489. vkDestroyInstance(g_instance, 0);
  490. }
  491. int get_gpu_count()
  492. {
  493. return g_gpu_count;
  494. }
  495. int get_default_gpu_index()
  496. {
  497. return g_default_gpu_index;
  498. }
  499. const GpuInfo& get_gpu_info(int device_index)
  500. {
  501. return g_gpu_infos[device_index];
  502. }
  503. struct layer_shader_registry_entry
  504. {
  505. const char* name;
  506. const uint32_t* spv_data;
  507. size_t spv_data_size;
  508. };
  509. #include "layer_shader_spv_data.h"
  510. static const layer_shader_registry_entry layer_shader_registry[] =
  511. {
  512. #include "layer_shader_registry.h"
  513. };
  514. static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry);
  515. VulkanDevice::VulkanDevice(int device_index) : info(g_gpu_infos[device_index])
  516. {
  517. const float queuePriorities[1] = { 1.f };// 0.f ~ 1.f
  518. std::vector<const char*> enabledExtensions;
  519. if (info.support_VK_KHR_8bit_storage)
  520. enabledExtensions.push_back("VK_KHR_8bit_storage");
  521. if (info.support_VK_KHR_16bit_storage)
  522. enabledExtensions.push_back("VK_KHR_16bit_storage");
  523. if (info.support_VK_KHR_bind_memory2)
  524. enabledExtensions.push_back("VK_KHR_bind_memory2");
  525. if (info.support_VK_KHR_dedicated_allocation)
  526. enabledExtensions.push_back("VK_KHR_dedicated_allocation");
  527. if (info.support_VK_KHR_descriptor_update_template)
  528. enabledExtensions.push_back("VK_KHR_descriptor_update_template");
  529. if (info.support_VK_KHR_get_memory_requirements2)
  530. enabledExtensions.push_back("VK_KHR_get_memory_requirements2");
  531. if (info.support_VK_KHR_get_physical_device_properties2)
  532. enabledExtensions.push_back("VK_KHR_get_physical_device_properties2");
  533. if (info.support_VK_KHR_push_descriptor)
  534. enabledExtensions.push_back("VK_KHR_push_descriptor");
  535. if (info.support_VK_KHR_shader_float16_int8)
  536. enabledExtensions.push_back("VK_KHR_shader_float16_int8");
  537. if (info.support_VK_KHR_shader_float_controls)
  538. enabledExtensions.push_back("VK_KHR_shader_float_controls");
  539. if (info.support_VK_KHR_storage_buffer_storage_class)
  540. enabledExtensions.push_back("VK_KHR_storage_buffer_storage_class");
  541. VkDeviceQueueCreateInfo deviceQueueCreateInfos[2];
  542. deviceQueueCreateInfos[0].sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
  543. deviceQueueCreateInfos[0].pNext = 0;
  544. deviceQueueCreateInfos[0].flags = 0;
  545. deviceQueueCreateInfos[0].queueFamilyIndex = info.compute_queue_index;
  546. deviceQueueCreateInfos[0].queueCount = 1;
  547. deviceQueueCreateInfos[0].pQueuePriorities = queuePriorities;
  548. deviceQueueCreateInfos[1].sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
  549. deviceQueueCreateInfos[1].pNext = 0;
  550. deviceQueueCreateInfos[1].flags = 0;
  551. deviceQueueCreateInfos[1].queueFamilyIndex = info.transfer_queue_index;
  552. deviceQueueCreateInfos[1].queueCount = 1;
  553. deviceQueueCreateInfos[1].pQueuePriorities = queuePriorities;
  554. VkDeviceCreateInfo deviceCreateInfo;
  555. deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
  556. deviceCreateInfo.pNext = 0;
  557. deviceCreateInfo.flags = 0;
  558. if (info.compute_queue_index == info.transfer_queue_index)
  559. {
  560. deviceCreateInfo.queueCreateInfoCount = 1;
  561. }
  562. else
  563. {
  564. deviceCreateInfo.queueCreateInfoCount = 2;
  565. }
  566. deviceCreateInfo.pQueueCreateInfos = deviceQueueCreateInfos;
  567. deviceCreateInfo.enabledLayerCount = 0;
  568. deviceCreateInfo.ppEnabledLayerNames = 0;
  569. deviceCreateInfo.enabledExtensionCount = enabledExtensions.size();
  570. deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();
  571. deviceCreateInfo.pEnabledFeatures = 0;// VkPhysicalDeviceFeatures pointer
  572. VkResult ret = vkCreateDevice(info.physical_device, &deviceCreateInfo, 0, &device);
  573. if (ret != VK_SUCCESS)
  574. {
  575. fprintf(stderr, "vkCreateDevice failed %d\n", ret);
  576. }
  577. init_device_extension();
  578. create_shader_module();
  579. blob_buffer_allocator = new VkBlobBufferAllocator(this);
  580. staging_buffer_allocator = new VkStagingBufferAllocator(this);
  581. }
  582. VulkanDevice::~VulkanDevice()
  583. {
  584. delete blob_buffer_allocator;
  585. delete staging_buffer_allocator;
  586. destroy_shader_module();
  587. vkDestroyDevice(device, 0);
  588. }
  589. VkShaderModule VulkanDevice::get_shader_module(const char* name) const
  590. {
  591. for (int i=0; i<layer_shader_registry_entry_count; i++)
  592. {
  593. if (strcmp(layer_shader_registry[i].name, name) == 0)
  594. return shader_modules[i];
  595. }
  596. fprintf(stderr, "no such shader module %s\n", name);
  597. return 0;
  598. }
  599. VkAllocator* VulkanDevice::allocator() const
  600. {
  601. return blob_buffer_allocator;
  602. }
  603. VkAllocator* VulkanDevice::staging_allocator() const
  604. {
  605. return staging_buffer_allocator;
  606. }
  607. int VulkanDevice::create_shader_module()
  608. {
  609. shader_modules.resize(layer_shader_registry_entry_count, VK_NULL_HANDLE);
  610. for (int i=0; i<layer_shader_registry_entry_count; i++)
  611. {
  612. VkShaderModuleCreateInfo shaderModuleCreateInfo;
  613. shaderModuleCreateInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
  614. shaderModuleCreateInfo.pNext = 0;
  615. shaderModuleCreateInfo.flags = 0;
  616. shaderModuleCreateInfo.codeSize = layer_shader_registry[i].spv_data_size;
  617. shaderModuleCreateInfo.pCode = layer_shader_registry[i].spv_data;
  618. VkResult ret = vkCreateShaderModule(device, &shaderModuleCreateInfo, 0, &shader_modules[i]);
  619. if (ret != VK_SUCCESS)
  620. {
  621. fprintf(stderr, "vkCreateShaderModule %s failed %d\n", layer_shader_registry[i].name, ret);
  622. return -1;
  623. }
  624. // fprintf(stderr, "shader_module %s created\n", layer_shader_registry[i].name);
  625. }
  626. return 0;
  627. }
  628. void VulkanDevice::destroy_shader_module()
  629. {
  630. for (int i=0; i<(int)shader_modules.size(); i++)
  631. {
  632. vkDestroyShaderModule(device, shader_modules[i], 0);
  633. }
  634. shader_modules.clear();
  635. }
  636. int VulkanDevice::init_device_extension()
  637. {
  638. if (info.support_VK_KHR_descriptor_update_template)
  639. {
  640. vkCreateDescriptorUpdateTemplateKHR = (PFN_vkCreateDescriptorUpdateTemplateKHR)vkGetDeviceProcAddr(device, "vkCreateDescriptorUpdateTemplateKHR");
  641. vkDestroyDescriptorUpdateTemplateKHR = (PFN_vkDestroyDescriptorUpdateTemplateKHR)vkGetDeviceProcAddr(device, "vkDestroyDescriptorUpdateTemplateKHR");
  642. vkUpdateDescriptorSetWithTemplateKHR = (PFN_vkUpdateDescriptorSetWithTemplateKHR)vkGetDeviceProcAddr(device, "vkUpdateDescriptorSetWithTemplateKHR");
  643. // fprintf(stderr, "vkCreateDescriptorUpdateTemplateKHR = %p\n", vkCreateDescriptorUpdateTemplateKHR);
  644. // fprintf(stderr, "vkDestroyDescriptorUpdateTemplateKHR = %p\n", vkDestroyDescriptorUpdateTemplateKHR);
  645. // fprintf(stderr, "vkUpdateDescriptorSetWithTemplateKHR = %p\n", vkUpdateDescriptorSetWithTemplateKHR);
  646. }
  647. if (info.support_VK_KHR_get_memory_requirements2)
  648. {
  649. vkGetImageMemoryRequirements2KHR = (PFN_vkGetImageMemoryRequirements2KHR)vkGetDeviceProcAddr(device, "vkGetImageMemoryRequirements2KHR");
  650. vkGetBufferMemoryRequirements2KHR = (PFN_vkGetBufferMemoryRequirements2KHR)vkGetDeviceProcAddr(device, "vkGetBufferMemoryRequirements2KHR");
  651. vkGetImageSparseMemoryRequirements2KHR = (PFN_vkGetImageSparseMemoryRequirements2KHR)vkGetDeviceProcAddr(device, "vkGetImageSparseMemoryRequirements2KHR");
  652. // fprintf(stderr, "vkGetImageMemoryRequirements2KHR = %p\n", vkGetImageMemoryRequirements2KHR);
  653. // fprintf(stderr, "vkGetBufferMemoryRequirements2KHR = %p\n", vkGetBufferMemoryRequirements2KHR);
  654. // fprintf(stderr, "vkGetImageSparseMemoryRequirements2KHR = %p\n", vkGetImageSparseMemoryRequirements2KHR);
  655. }
  656. if (info.support_VK_KHR_push_descriptor)
  657. {
  658. if (info.support_VK_KHR_descriptor_update_template)
  659. {
  660. vkCmdPushDescriptorSetWithTemplateKHR = (PFN_vkCmdPushDescriptorSetWithTemplateKHR)vkGetDeviceProcAddr(device, "vkCmdPushDescriptorSetWithTemplateKHR");
  661. // fprintf(stderr, "vkCmdPushDescriptorSetWithTemplateKHR = %p\n", vkCmdPushDescriptorSetWithTemplateKHR);
  662. }
  663. vkCmdPushDescriptorSetKHR = (PFN_vkCmdPushDescriptorSetKHR)vkGetDeviceProcAddr(device, "vkCmdPushDescriptorSetKHR");
  664. // fprintf(stderr, "vkCmdPushDescriptorSetKHR = %p\n", vkCmdPushDescriptorSetKHR);
  665. }
  666. return 0;
  667. }
  668. } // namespace ncnn
  669. #endif // NCNN_VULKAN