You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

_collect_npu.py 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """The npu collector."""
  16. import inspect
  17. from collections import defaultdict
  18. from ctypes import CDLL, Structure, byref, c_char, c_int, c_uint, c_ulong, c_ushort
  19. from functools import lru_cache, wraps
  20. from threading import Lock, Thread
  21. from mindinsight.sysmetric.common.exceptions import DsmiQueryingException
  22. from mindinsight.sysmetric.common.log import logger
  23. def _timeout(seconds, default):
  24. """
  25. The timeout decorator wait for specified seconds or return the default value.
  26. Args:
  27. seconds (float): The specified seconds.
  28. default (Any): The default value.
  29. """
  30. def outer(fn):
  31. cached, lockdict = {}, defaultdict(Lock)
  32. def target(*args):
  33. lock = lockdict[args]
  34. if lock.acquire(blocking=False):
  35. try:
  36. cached[args] = fn(*args)
  37. finally:
  38. lock.release()
  39. else:
  40. logger.debug('%s%r skipped.', fn.__name__, args)
  41. @wraps(fn)
  42. def inner(*args):
  43. thread = Thread(target=target, args=args, daemon=True)
  44. thread.start()
  45. thread.join(seconds)
  46. if thread.is_alive():
  47. logger.debug('%s%r timeouted.', fn.__name__, args)
  48. return cached.get(args, default)
  49. return inner
  50. return outer
  51. def _fallback_to_prev_result(fn):
  52. """Fallback to previous successful result when failing."""
  53. prev_result = None
  54. @wraps(fn)
  55. def wrap(*args):
  56. nonlocal prev_result
  57. sucess, result = fn(*args)
  58. if sucess:
  59. prev_result = result
  60. return sucess, result
  61. if prev_result is not None:
  62. return sucess, prev_result
  63. raise RuntimeError(f'{fn.__name__} querying failed and no previous successful result.')
  64. return wrap
  65. def _libsmicall(*args):
  66. """
  67. Call the lib function to querying NPU metrics.
  68. Returns:
  69. bool, True when success of querying, False otherwise.
  70. """
  71. if not libsmi:
  72. logger.error('Trying to call the libdrvdsmi_host which is not loaded.')
  73. raise ValueError('Trying to call the libdrvdsmi_host which is not loaded.')
  74. fname = inspect.stack()[1].function
  75. error_code = getattr(libsmi, fname)(*args)
  76. if error_code != 0:
  77. logger.error('%s querying failed with error code %d.', fname, error_code)
  78. return error_code == 0
  79. @lru_cache(maxsize=4)
  80. def dsmi_get_device_count():
  81. """
  82. Get device count.
  83. Returns:
  84. int, the device count.
  85. Raises:
  86. RuntimeError, when querying dsmi returning non-zero.
  87. """
  88. device_count = c_int()
  89. if _libsmicall(byref(device_count)):
  90. return device_count.value
  91. raise RuntimeError('Querying device count failed.')
  92. @lru_cache(maxsize=4)
  93. def dsmi_list_device(count):
  94. """
  95. List the device IDs.
  96. Args:
  97. count (int): The device count.
  98. Returns:
  99. List[int], the device IDs.
  100. Raises:
  101. RuntimeError, when querying dsmi returning non-zero.
  102. """
  103. device_id_array = c_int * count
  104. device_id_list = device_id_array()
  105. count = c_int(count)
  106. if _libsmicall(device_id_list, count):
  107. return list(device_id_list)
  108. raise RuntimeError('Querying device id list failed.')
  109. @lru_cache(maxsize=8)
  110. @_fallback_to_prev_result
  111. def dsmi_get_chip_info(device_id):
  112. """
  113. Get chip info.
  114. Args:
  115. device_id (int): The specific device id.
  116. Returns:
  117. dict, the chip info:
  118. - chip_type (str): The chip type.
  119. - chip_name (str): The chip name.
  120. - chip_ver (str): The chip name.
  121. Raises:
  122. RuntimeError, when querying dsmi returning non-zero.
  123. """
  124. class ChipInfoStruct(Structure):
  125. _fields_ = [('chip_type', c_char * 32), ('chip_name', c_char * 32), ('chip_ver', c_char * 32)]
  126. device_id = c_int(device_id)
  127. chip_info = ChipInfoStruct()
  128. success = _libsmicall(device_id, byref(chip_info))
  129. return success, {
  130. 'chip_type': chip_info.chip_type.decode('utf-8'),
  131. 'chip_name': chip_info.chip_name.decode('utf-8'),
  132. 'chip_ver': chip_info.chip_ver.decode('utf-8')
  133. }
  134. @_fallback_to_prev_result
  135. def dsmi_get_device_health(device_id):
  136. """
  137. Get device health.
  138. Args:
  139. device_id (int): The specific device id.
  140. Returns:
  141. int, 0 indicats normal, 1 minor alarm, 2 major alarm, 3 critical alarm, 0xffffffff device not found.
  142. Raises:
  143. RuntimeError, when querying dsmi returning non-zero.
  144. """
  145. device_id = c_int(device_id)
  146. health = c_uint()
  147. success = _libsmicall(device_id, byref(health))
  148. return success, health.value
  149. @lru_cache(maxsize=8)
  150. @_fallback_to_prev_result
  151. def dsmi_get_device_ip_address(device_id):
  152. """
  153. Get device IP address.
  154. Args:
  155. device_id (int): The specific device ID.
  156. Returns:
  157. dict, the device IP address:
  158. - ip_address (str): the IP address.
  159. - mask_address (str): the mask address.
  160. Raises:
  161. RuntimeError, when querying dsmi returning non-zero.
  162. """
  163. is_ipv6, port_type, port_id = False, 1, 0
  164. class Ipaddrstruct(Structure):
  165. _fields_ = [('u_addr', c_char * (16 if is_ipv6 else 4)), ('ip_type', c_int)]
  166. ip_type = c_int(1 if is_ipv6 else 0)
  167. device_id = c_int(device_id)
  168. ip_address = Ipaddrstruct(b'', ip_type)
  169. mask_address = Ipaddrstruct(b'', ip_type)
  170. success = _libsmicall(device_id, port_type, port_id, byref(ip_address), byref(mask_address))
  171. def pad(u_addr):
  172. for i in range(4):
  173. if i < len(u_addr):
  174. yield u_addr[i]
  175. else:
  176. yield 0
  177. return success, {
  178. 'ip_address': '.'.join(str(c) for c in pad(ip_address.u_addr)),
  179. 'mask_address': '.'.join(str(c) for c in pad(mask_address.u_addr))
  180. }
  181. @_fallback_to_prev_result
  182. def dsmi_get_hbm_info(device_id):
  183. """
  184. Get the HBM info.
  185. Args:
  186. device_id (int): The specific device id.
  187. Returns:
  188. dict, the HBM info:
  189. memory_size (int), The total HBM memory, in KB.
  190. frep (int), The HBM frequency, in MHZ.
  191. memory_usage (int), The used HBM memory, in KB.
  192. temp (int), The HBM temperature, in °C.
  193. bandwith_util_rate (int): The bandwith util rate, in %.
  194. Raises:
  195. RuntimeError, when querying dsmi returning non-zero.
  196. """
  197. class HbmInfoStruct(Structure):
  198. _fields_ = [('memory_size', c_ulong), ('freq', c_uint), ('memory_usage', c_ulong), ('temp', c_int),
  199. ('bandwith_util_rate', c_uint)]
  200. device_id = c_int(device_id)
  201. hbm_info = HbmInfoStruct()
  202. success = _libsmicall(device_id, byref(hbm_info))
  203. return success, {
  204. 'memory_size': hbm_info.memory_size,
  205. 'freq': hbm_info.freq,
  206. 'memory_usage': hbm_info.memory_usage,
  207. 'temp': hbm_info.temp,
  208. 'bandwith_util_rate': hbm_info.bandwith_util_rate
  209. }
  210. @_timeout(0.2, -1)
  211. def dsmi_get_device_utilization_rate(device_id, device_type):
  212. """
  213. Get device utilization rate, %.
  214. Note: Query AI Core when profiling turns on will return failure.
  215. Args:
  216. device_id (int): The specific device id
  217. device_type (int): The device type, 1 for memory, 2 AI Core, 5 memory bandwidth, 6 HBM, 10 HBM bandwidth.
  218. Returns:
  219. int, the utilization rate, returning -1 to indicate querying failed.
  220. """
  221. device_id = c_int(device_id)
  222. device_type = c_int(device_type)
  223. utilization_rate = c_uint()
  224. if _libsmicall(device_id, device_type, byref(utilization_rate)):
  225. return utilization_rate.value
  226. return -1
  227. @_fallback_to_prev_result
  228. def dsmi_get_device_power_info(device_id):
  229. """
  230. Get the device power.
  231. Args:
  232. device_id (int): The specific device id.
  233. Returns:
  234. dict, the device power info.
  235. - power, the device power, in Watt.
  236. Raises:
  237. RuntimeError, when querying dsmi returning non-zero.
  238. """
  239. class PowerInfoStruct(Structure):
  240. _fields_ = [('power', c_ushort)]
  241. power_info = PowerInfoStruct()
  242. device_id = c_int(device_id)
  243. success = _libsmicall(device_id, byref(power_info))
  244. return success, {'power': round(power_info.power * 0.1, 2)}
  245. @_fallback_to_prev_result
  246. def dsmi_get_device_temperature(device_id):
  247. """
  248. Get the device temperature.
  249. Args:
  250. device_id (int): The specific device id.
  251. Returns:
  252. int, the device temperature, in °C.
  253. Raises:
  254. RuntimeError, when querying dsmi returning non-zero.
  255. """
  256. device_id = c_int(device_id)
  257. temperature = c_uint()
  258. success = _libsmicall(device_id, byref(temperature))
  259. return success, temperature.value
  260. def collect_npu():
  261. """Collect the metrics for each NPUs.
  262. Returns:
  263. List[dict], the metrics of each NPUs.
  264. Raises:
  265. DsmiQueryingException, when querying dsmi returning non-zero.
  266. """
  267. try:
  268. return _collect_npus()
  269. except RuntimeError as e:
  270. logger.warning(e.args[0])
  271. raise DsmiQueryingException(e.args[0])
  272. def _collect_npus():
  273. """Collect the metrics for each NPUs.
  274. Returns:
  275. List[dict], the metrics of each NPUs.
  276. Raises:
  277. RuntimeError, when querying dsmi returning non-zero.
  278. """
  279. if not libsmi:
  280. return None
  281. count = dsmi_get_device_count()
  282. device_ids = dsmi_list_device(count)
  283. npus = []
  284. for device_id in device_ids:
  285. npu = _collect_one(device_id)
  286. npus.append(npu)
  287. return npus
  288. def _collect_one(device_id):
  289. """
  290. Collect NPU info by the device_id.
  291. Args:
  292. device_id (int): The specific device id.
  293. Returns:
  294. dict, the NPU info.
  295. Raises:
  296. RuntimeError, when querying dsmi returning non-zero.
  297. """
  298. kb_to_mb, memory_threshold, success = 1024, 4, [True] * 6
  299. success[0], health = dsmi_get_device_health(device_id)
  300. success[1], hbm_info = dsmi_get_hbm_info(device_id)
  301. success[2], chip_info = dsmi_get_chip_info(device_id)
  302. success[3], ip_addr = dsmi_get_device_ip_address(device_id)
  303. success[4], power_info = dsmi_get_device_power_info(device_id)
  304. success[5], temperature = dsmi_get_device_temperature(device_id)
  305. aicore_rate = dsmi_get_device_utilization_rate(device_id, 2)
  306. return {
  307. 'chip_name': chip_info.get('chip_name'),
  308. 'device_id': device_id,
  309. 'available': all(success) and health == 0 and hbm_info.get('memory_usage', 0) // kb_to_mb < memory_threshold,
  310. 'health': health,
  311. 'ip_address': ip_addr.get('ip_address'),
  312. 'aicore_rate': aicore_rate,
  313. 'hbm_info': {
  314. 'memory_size': hbm_info.get('memory_size') // kb_to_mb,
  315. 'memory_usage': hbm_info.get('memory_usage') // kb_to_mb
  316. },
  317. 'power': power_info.get('power'),
  318. 'temperature': temperature,
  319. 'success': all(success)
  320. }
  321. try:
  322. libsmi = CDLL('libdrvdsmi_host.so')
  323. except OSError:
  324. logger.info('Failed to load libdrvdsmi_host.so.')
  325. libsmi = None