
explain_manager.py 12 kB

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""ExplainManager."""
import os
import threading
import time

from mindinsight.datavisual.common import exceptions
from mindinsight.datavisual.common.enums import BaseEnum
from mindinsight.explainer.common.log import logger
from mindinsight.explainer.manager.explain_job import ExplainJob
from mindinsight.datavisual.data_access.file_handler import FileHandler
from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
from mindinsight.utils.exceptions import MindInsightException, ParamValueError

_MAX_LOADER_NUM = 3
_MAX_INTERVAL = 3


class _ExplainManagerStatus(BaseEnum):
    """Manager status."""
    INIT = 'INIT'
    LOADING = 'LOADING'
    DONE = 'DONE'
    INVALID = 'INVALID'


class ExplainManager:
    """ExplainManager."""

    def __init__(self, summary_base_dir: str):
        self._summary_base_dir = summary_base_dir
        self._loader_pool = {}
        self._deleted_ids = []
        self._status = _ExplainManagerStatus.INIT.value
        self._status_mutex = threading.Lock()
        self._loader_pool_mutex = threading.Lock()
        self._max_loader_num = _MAX_LOADER_NUM
        self._reload_interval = None

    def _reload_data(self):
        """Periodically load summaries from files."""
        while True:
            self._load_data()
            if not self._reload_interval:
                break
            time.sleep(self._reload_interval)

    def _load_data(self):
        """Load the summaries in the given base directory."""
        logger.info(
            'Start to load data, reload interval: %r.', self._reload_interval)
        with self._status_mutex:
            if self._status == _ExplainManagerStatus.LOADING.value:
                logger.info('Current status is %s, will ignore to load data.',
                            self._status)
                return
            self._status = _ExplainManagerStatus.LOADING.value
            self._generate_loaders()
            self._execute_load_data()
            if not self._loader_pool:
                self._status = _ExplainManagerStatus.INVALID.value
            else:
                self._status = _ExplainManagerStatus.DONE.value
            logger.info('Load event data end, status: %r, '
                        'and loader pool size is %r',
                        self._status, len(self._loader_pool))

    def _update_loader_latest_update_time(self, loader_id, latest_update_time=None):
        """Update the latest update time of the loader with the given id."""
        if latest_update_time is None:
            latest_update_time = time.time()
        self._loader_pool[loader_id].latest_update_time = latest_update_time

    def _delete_loader(self, loader_id):
        """Delete the loader with the given loader_id."""
        if self._loader_pool.get(loader_id, None) is not None:
            self._loader_pool.pop(loader_id)
            logger.debug('delete loader %s', loader_id)

    def _add_loader(self, loader):
        """Add a loader to the loader pool, evicting the least recently updated loaders if the pool is full."""
        if len(self._loader_pool) >= _MAX_LOADER_NUM:
            delete_num = len(self._loader_pool) - _MAX_LOADER_NUM + 1
            sorted_loaders = sorted(
                self._loader_pool.items(),
                key=lambda x: x[1].latest_update_time)
            for index in range(delete_num):
                delete_loader_id = sorted_loaders[index][0]
                self._delete_loader(delete_loader_id)
        self._loader_pool.update({loader.loader_id: loader})

    def _deal_loaders(self, latest_loaders):
        """Update the loader pool."""
        with self._loader_pool_mutex:
            for loader_id, loader in latest_loaders:
                if self._loader_pool.get(loader_id, None) is None:
                    self._add_loader(loader)
                    continue
                if (self._loader_pool[loader_id].latest_update_time
                        < loader.latest_update_time):
                    self._update_loader_latest_update_time(
                        loader_id, loader.latest_update_time)

    @staticmethod
    def _generate_loader_id(relative_path):
        """Generate a loader id for the given path."""
        loader_id = relative_path
        return loader_id

    @staticmethod
    def _generate_loader_name(relative_path):
        """Generate a loader name for the given path."""
        loader_name = relative_path
        return loader_name

    def _generate_loader_by_relative_path(self, relative_path: str) -> ExplainJob:
        """Generate explain job from given relative path."""
        current_dir = os.path.realpath(FileHandler.join(
            self._summary_base_dir, relative_path
        ))
        loader_id = self._generate_loader_id(relative_path)
        loader = ExplainJob(
            job_id=loader_id,
            summary_dir=current_dir,
            create_time=ExplainJob.get_create_time(current_dir),
            latest_update_time=ExplainJob.get_update_time(current_dir))
        return loader

    def _generate_loaders(self):
        """Generate job loaders from the summary watcher."""
        dir_map_mtime_dict = {}
        loader_dict = {}
        min_modify_time = None
        _, summaries = SummaryWatcher().list_explain_directories(
            self._summary_base_dir)
        for item in summaries:
            relative_path = item.get('relative_path')
            modify_time = item.get('update_time').timestamp()
            loader_id = self._generate_loader_id(relative_path)
            loader = self._loader_pool.get(loader_id, None)
            if loader is not None and loader.latest_update_time > modify_time:
                modify_time = loader.latest_update_time
            if min_modify_time is None:
                min_modify_time = modify_time
            if len(dir_map_mtime_dict) < _MAX_LOADER_NUM:
                if modify_time < min_modify_time:
                    min_modify_time = modify_time
                dir_map_mtime_dict.update({relative_path: modify_time})
            else:
                if modify_time >= min_modify_time:
                    dir_map_mtime_dict.update({relative_path: modify_time})
        sorted_dir_tuple = sorted(dir_map_mtime_dict.items(),
                                  key=lambda d: d[1])[-_MAX_LOADER_NUM:]
        for relative_path, modify_time in sorted_dir_tuple:
            loader_id = self._generate_loader_id(relative_path)
            loader = self._generate_loader_by_relative_path(relative_path)
            loader_dict.update({loader_id: loader})
        sorted_loaders = sorted(loader_dict.items(),
                                key=lambda x: x[1].latest_update_time)
        latest_loaders = sorted_loaders[-_MAX_LOADER_NUM:]
        self._deal_loaders(latest_loaders)

    def _execute_loader(self, loader_id):
        """Execute the data loading."""
        try:
            with self._loader_pool_mutex:
                loader = self._loader_pool.get(loader_id, None)
                if loader is None:
                    logger.debug('Loader %r has been deleted, will not load '
                                 'data', loader_id)
                    return
            loader.load()
        except MindInsightException as e:
            logger.warning('Data loader %r load data failed. Delete data_loader. Detail: %s', loader_id, e)
            with self._loader_pool_mutex:
                self._delete_loader(loader_id)

    def _execute_load_data(self):
        """Execute the loaders in the pool to load data."""
        loader_pool = self._get_snapshot_loader_pool()
        for loader_id in loader_pool:
            self._execute_loader(loader_id)

    def _get_snapshot_loader_pool(self):
        """Get a snapshot of the loader pool."""
        with self._loader_pool_mutex:
            return dict(self._loader_pool)

    def _check_status_valid(self):
        """Check manager status."""
        if self._status == _ExplainManagerStatus.INIT.value:
            raise exceptions.SummaryLogIsLoading('Data is loading, current status is %s' % self._status)

    @staticmethod
    def _check_train_id_valid(train_id: str):
        """Verify the train_id is valid."""
        if not train_id.startswith('./'):
            logger.warning('train_id does not start with "./"')
            return False
        if len(train_id.split('/')) > 2:
            logger.warning('train_id contains multiple "/"')
            return False
        return True

    def _check_train_job_exist(self, train_id):
        """Verify that the train job exists for the given train_id."""
        if train_id in self._loader_pool:
            return
        self._check_train_id_valid(train_id)
        if SummaryWatcher().is_summary_directory(self._summary_base_dir, train_id):
            return
        raise ParamValueError('Can not find the train job in the manager, train_id: %s' % train_id)

    def _reload_data_again(self):
        """Reload the data one more time."""
        logger.debug('Start to reload data again.')
        thread = threading.Thread(target=self._load_data,
                                  name='reload_data_thread')
        thread.daemon = False
        thread.start()

    def _get_job(self, train_id):
        """Retrieve train_job given train_id."""
        is_reload = False
        with self._loader_pool_mutex:
            loader = self._loader_pool.get(train_id, None)
            if loader is None:
                relative_path = train_id
                temp_loader = self._generate_loader_by_relative_path(
                    relative_path)
                if temp_loader is None:
                    return None
                self._add_loader(temp_loader)
                is_reload = True
        if is_reload:
            self._reload_data_again()
        return loader

    @property
    def summary_base_dir(self):
        """Return the base directory for summary records."""
        return self._summary_base_dir

    def get_job(self, train_id):
        """
        Return the ExplainJob for the given train_id.

        If no explain job is found for the given train_id, None is returned.

        Args:
            train_id (str): The id of the expected ExplainJob.

        Returns:
            explain_job
        """
        self._check_status_valid()
        self._check_train_job_exist(train_id)
        loader = self._get_job(train_id)
        if loader is None:
            return None
        return loader

    def start_load_data(self, reload_interval=_MAX_INTERVAL):
        """
        Start a thread for loading data.

        Args:
            reload_interval (int): Interval in seconds at which the summaries are reloaded from files.
        """
        self._reload_interval = reload_interval
        thread = threading.Thread(target=self._reload_data, name='start_load_data_thread')
        thread.daemon = True
        thread.start()
        # Wait briefly for the initial data loading.
        time.sleep(1)
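
For reference, below is a minimal usage sketch of the class above. It is not part of explain_manager.py: the import path is assumed from the package layout of the other imports in the file, and the summary path and the relative train_id './job_0' are illustrative placeholders.

# Hypothetical usage sketch (assumptions noted above).
from mindinsight.explainer.manager.explain_manager import ExplainManager

# Point the manager at the directory that contains the explain summary sub-directories.
manager = ExplainManager(summary_base_dir='/path/to/summary_base_dir')

# Spawns a daemon thread that calls _load_data() every `reload_interval` seconds,
# then waits about one second for the first load to get under way.
manager.start_load_data(reload_interval=3)

# train_id is the relative path of an explain summary directory, e.g. './job_0'.
# get_job() raises SummaryLogIsLoading while the manager is still in its INIT state,
# raises ParamValueError for an unknown directory, and returns None when the job
# has not been pulled into the loader pool yet.
job = manager.get_job('./job_0')
if job is None:
    print('Explain job is not loaded yet, try again later.')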