You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

_check_version.py 19 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """version and config check"""
  16. import os
  17. import sys
  18. import subprocess
  19. from pathlib import Path
  20. from abc import abstractmethod, ABCMeta
  21. import numpy as np
  22. from packaging import version
  23. from . import log as logger
  24. from .version import __version__
  25. from .default_config import __package_name__
  26. class EnvChecker(metaclass=ABCMeta):
  27. """basic class for environment check"""
  28. @abstractmethod
  29. def check_env(self, e):
  30. pass
  31. @abstractmethod
  32. def set_env(self):
  33. pass
  34. @abstractmethod
  35. def check_version(self):
  36. pass
  37. class GPUEnvChecker(EnvChecker):
  38. """GPU environment check."""
  39. def __init__(self):
  40. self.version = ["10.1", "11.1"]
  41. self.lib_key_to_lib_name = {'libcu': 'libcuda.so'}
  42. # env
  43. self.path = os.getenv("PATH")
  44. self.ld_lib_path = os.getenv("LD_LIBRARY_PATH")
  45. # check
  46. self.v = "0"
  47. self.cuda_lib_path = self._get_lib_path("libcu")
  48. self.cuda_bin_path = self._get_bin_path("cuda")
  49. self.cudnn_lib_path = self._get_lib_path("libcudnn")
  50. def check_env(self, e):
  51. raise e
  52. def set_env(self):
  53. return
  54. def _get_bin_path(self, bin_name):
  55. """Get bin path by bin name."""
  56. if bin_name == "cuda":
  57. return self._get_cuda_bin_path()
  58. return []
  59. def _get_cuda_bin_path(self):
  60. """Get cuda bin path by lib path."""
  61. path_list = []
  62. for path in self.cuda_lib_path:
  63. path = os.path.abspath(path.strip()+"/bin/")
  64. if Path(path).is_dir():
  65. path_list.append(path)
  66. return np.unique(path_list)
  67. def _get_nvcc_version(self, is_set_env):
  68. """Get cuda version by nvcc command."""
  69. nvcc_result = subprocess.run(["nvcc --version | grep release"],
  70. timeout=3, text=True, capture_output=True, check=False, shell=True)
  71. if nvcc_result.returncode:
  72. if not is_set_env:
  73. for path in self.cuda_bin_path:
  74. if Path(path + "/nvcc").is_file():
  75. os.environ['PATH'] = path + ":" + os.environ['PATH']
  76. return self._get_nvcc_version(True)
  77. return ""
  78. result = nvcc_result.stdout
  79. for line in result.split('\n'):
  80. if line:
  81. return line.strip().split("release")[1].split(",")[0].strip()
  82. return ""
  83. def _get_cudnn_version(self):
  84. """Get cudnn version by libcudnn.so."""
  85. cudnn_version = []
  86. for path in self.cudnn_lib_path:
  87. ls_cudnn = subprocess.run(["ls " + path + "/lib64/libcudnn.so.*.*"], timeout=10, text=True,
  88. capture_output=True, check=False, shell=True)
  89. if ls_cudnn.returncode == 0:
  90. cudnn_version = ls_cudnn.stdout.split('/')[-1].strip('libcudnn.so.').strip().split('.')
  91. if len(cudnn_version) == 2:
  92. cudnn_version.append('0')
  93. break
  94. version_str = ''.join([n for n in cudnn_version])
  95. return version_str
  96. def check_version(self):
  97. """Check cuda version."""
  98. version_match = False
  99. for path in self.cuda_lib_path:
  100. version_file = path + "/version.txt"
  101. if not Path(version_file).is_file():
  102. continue
  103. if self._check_version(version_file):
  104. version_match = True
  105. break
  106. if not version_match:
  107. if self.v == "0":
  108. logger.warning("Cuda version file version.txt is not found, please confirm that the correct "
  109. "cuda version has been installed, you can refer to the "
  110. "installation guidelines: https://www.mindspore.cn/install")
  111. else:
  112. logger.warning(f"MindSpore version {__version__} and cuda version {self.v} does not match, "
  113. "please refer to the installation guide for version matching "
  114. "information: https://www.mindspore.cn/install")
  115. nvcc_version = self._get_nvcc_version(False)
  116. if nvcc_version and (nvcc_version not in self.version):
  117. logger.warning(f"MindSpore version {__version__} and nvcc(cuda bin) version {nvcc_version} "
  118. "does not match, please refer to the installation guide for version matching "
  119. "information: https://www.mindspore.cn/install")
  120. cudnn_version = self._get_cudnn_version()
  121. if cudnn_version and int(cudnn_version) < 760:
  122. logger.warning(f"MindSpore version {__version__} and cudDNN version {cudnn_version} "
  123. "does not match, please refer to the installation guide for version matching "
  124. "information: https://www.mindspore.cn/install. The recommended version is "
  125. "CUDA10.1 with cuDNN7.6.x and CUAD11.1 with cuDNN8.0.x")
  126. if cudnn_version and int(cudnn_version) < 800 and int(str(self.v).split('.')[0]) > 10:
  127. logger.warning(f"CUDA version {self.v} and cuDNN version {cudnn_version} "
  128. "does not match, please refer to the installation guide for version matching "
  129. "information: https://www.mindspore.cn/install. The recommended version is "
  130. "CUAD11.1 with cuDNN8.0.x")
  131. def _check_version(self, version_file):
  132. """Check cuda version by version.txt."""
  133. v = self._read_version(version_file)
  134. v = version.parse(v)
  135. v_str = str(v.major) + "." + str(v.minor)
  136. if v_str not in self.version:
  137. return False
  138. return True
  139. def _get_lib_path(self, lib_name):
  140. """Get gpu lib path by ldd command."""
  141. path_list = []
  142. current_path = os.path.split(os.path.realpath(__file__))[0]
  143. try:
  144. ldd_result = subprocess.run(["ldd " + current_path + "/_c_expression*.so* | grep " + lib_name],
  145. timeout=10, text=True, capture_output=True, check=False, shell=True)
  146. if ldd_result.returncode:
  147. logger.error(f"{self.lib_key_to_lib_name[lib_name]} (need by mindspore-gpu) is not found, please "
  148. f"confirm that _c_expression.so is in directory:{current_path} and the correct cuda "
  149. "version has been installed, you can refer to the installation "
  150. "guidelines: https://www.mindspore.cn/install")
  151. return path_list
  152. result = ldd_result.stdout
  153. for i in result.split('\n'):
  154. path = i.partition("=>")[2]
  155. if path.lower().find("not found") > 0:
  156. logger.warning(f"Cuda {self.version} version(need by mindspore-gpu) is not found, please confirm "
  157. "that the path of cuda is set to the env LD_LIBRARY_PATH, please refer to the "
  158. "installation guidelines: https://www.mindspore.cn/install")
  159. continue
  160. path = path.partition(lib_name)[0]
  161. if path:
  162. path_list.append(os.path.abspath(path.strip() + "../"))
  163. return np.unique(path_list)
  164. except subprocess.TimeoutExpired:
  165. logger.warning("Failed to check cuda version due to the ldd command timeout, please confirm that "
  166. "the correct cuda version has been installed, you can refer to the "
  167. "installation guidelines: https://www.mindspore.cn/install")
  168. return path_list
  169. def _read_version(self, file_path):
  170. """Get gpu version info in version.txt."""
  171. with open(file_path, 'r') as f:
  172. all_info = f.readlines()
  173. for line in all_info:
  174. if line.startswith("CUDA Version"):
  175. self.v = line.strip().split("CUDA Version")[1]
  176. return self.v
  177. return self.v
  178. class AscendEnvChecker(EnvChecker):
  179. """ascend environment check"""
  180. def __init__(self):
  181. self.version = ["1.78.T2.0.B020"]
  182. atlas_nnae_version = "/usr/local/Ascend/nnae/latest/fwkacllib/version.info"
  183. atlas_toolkit_version = "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/version.info"
  184. hisi_fwk_version = "/usr/local/Ascend/fwkacllib/version.info"
  185. if os.path.exists(atlas_nnae_version):
  186. # atlas default path
  187. self.fwk_path = "/usr/local/Ascend/nnae/latest/fwkacllib"
  188. self.op_impl_path = "/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe"
  189. self.tbe_path = self.fwk_path + "/lib64"
  190. self.cce_path = self.fwk_path + "/ccec_compiler/bin"
  191. self.fwk_version = atlas_nnae_version
  192. self.op_path = "/usr/local/Ascend/nnae/latest/opp"
  193. elif os.path.exists(atlas_toolkit_version):
  194. # atlas default path
  195. self.fwk_path = "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib"
  196. self.op_impl_path = "/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe"
  197. self.tbe_path = self.fwk_path + "/lib64"
  198. self.cce_path = self.fwk_path + "/ccec_compiler/bin"
  199. self.fwk_version = atlas_toolkit_version
  200. self.op_path = "/usr/local/Ascend/ascend-toolkit/latest/opp"
  201. elif os.path.exists(hisi_fwk_version):
  202. # hisi default path
  203. self.fwk_path = "/usr/local/Ascend/fwkacllib"
  204. self.op_impl_path = "/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe"
  205. self.tbe_path = self.fwk_path + "/lib64"
  206. self.cce_path = self.fwk_path + "/ccec_compiler/bin"
  207. self.fwk_version = hisi_fwk_version
  208. self.op_path = "/usr/local/Ascend/opp"
  209. else:
  210. # custom or unknown environment
  211. self.fwk_path = ""
  212. self.op_impl_path = ""
  213. self.tbe_path = ""
  214. self.cce_path = ""
  215. self.fwk_version = ""
  216. self.op_path = ""
  217. # env
  218. self.path = os.getenv("PATH")
  219. self.python_path = os.getenv("PYTHONPATH")
  220. self.ld_lib_path = os.getenv("LD_LIBRARY_PATH")
  221. self.ascend_opp_path = os.getenv("ASCEND_OPP_PATH")
  222. # check content
  223. self.path_check = "/fwkacllib/ccec_compiler/bin"
  224. self.python_path_check = "opp/op_impl/built-in/ai_core/tbe"
  225. self.ld_lib_path_check_fwk = "/fwkacllib/lib64"
  226. self.ld_lib_path_check_addons = "/add-ons"
  227. self.ascend_opp_path_check = "/op"
  228. self.v = ""
  229. def check_env(self, e):
  230. self._check_env()
  231. raise e
  232. def check_version(self):
  233. if not Path(self.fwk_version).is_file():
  234. logger.warning("Using custom Ascend 910 AI software package path, package version checking is skipped, "
  235. "please make sure Ascend 910 AI software package version is supported, you can reference to "
  236. "the installation guidelines https://www.mindspore.cn/install")
  237. return
  238. v = self._read_version(self.fwk_version)
  239. if v not in self.version:
  240. v_list = str([x for x in self.version])
  241. logger.warning(f"MindSpore version {__version__} and Ascend 910 AI software package version {v} does not "
  242. f"match, the version of software package expect one of {v_list}, "
  243. "please reference to the match info on: https://www.mindspore.cn/install")
  244. def check_deps_version(self):
  245. """
  246. te, topi, hccl wheel package version check
  247. in order to update the change of 'LD_LIBRARY_PATH' env, run a sub process
  248. """
  249. input_args = ["--mindspore_version=" + __version__]
  250. for v in self.version:
  251. input_args.append("--supported_version=" + v)
  252. deps_version_checker = os.path.join(os.path.split(os.path.realpath(__file__))[0], "_check_deps_version.py")
  253. call_cmd = [sys.executable, deps_version_checker] + input_args
  254. try:
  255. process = subprocess.run(call_cmd, timeout=3, text=True, capture_output=True, check=False)
  256. if process.stdout.strip() != "":
  257. logger.warning(process.stdout.strip())
  258. except subprocess.TimeoutExpired:
  259. logger.info("Package te, topi, hccl version check timed out, skip.")
  260. def set_env(self):
  261. if not self.tbe_path:
  262. self._check_env()
  263. return
  264. try:
  265. import te
  266. except Exception:
  267. if Path(self.tbe_path).is_dir():
  268. if os.getenv('LD_LIBRARY_PATH'):
  269. os.environ['LD_LIBRARY_PATH'] = self.tbe_path + ":" + os.environ['LD_LIBRARY_PATH']
  270. else:
  271. os.environ['LD_LIBRARY_PATH'] = self.tbe_path
  272. else:
  273. raise EnvironmentError(
  274. f"No such directory: {self.tbe_path}, Please check if Ascend 910 AI software package is "
  275. "installed correctly.")
  276. # check te version after set te env
  277. self.check_deps_version()
  278. if Path(self.op_impl_path).is_dir():
  279. # python path for sub process
  280. if os.getenv('PYTHONPATH'):
  281. os.environ['PYTHONPATH'] = self.op_impl_path + ":" + os.environ['PYTHONPATH']
  282. else:
  283. os.environ['PYTHONPATH'] = self.op_impl_path
  284. # sys path for this process
  285. sys.path.append(self.op_impl_path)
  286. os.environ['TBE_IMPL_PATH'] = self.op_impl_path
  287. else:
  288. raise EnvironmentError(
  289. f"No such directory: {self.op_impl_path}, Please check if Ascend 910 AI software package is "
  290. "installed correctly.")
  291. if Path(self.cce_path).is_dir():
  292. os.environ['PATH'] = self.cce_path + ":" + os.environ['PATH']
  293. else:
  294. raise EnvironmentError(
  295. f"No such directory: {self.cce_path}, Please check if Ascend 910 AI software package is "
  296. "installed correctly.")
  297. if self.op_path is None:
  298. pass
  299. elif Path(self.op_path).is_dir():
  300. os.environ['ASCEND_OPP_PATH'] = self.op_path
  301. else:
  302. raise EnvironmentError(
  303. f"No such directory: {self.op_path}, Please check if Ascend 910 AI software package is "
  304. "installed correctly.")
  305. def _check_env(self):
  306. """ascend dependence path check"""
  307. if self.path is None or self.path_check not in self.path:
  308. logger.warning("Can not find ccec_compiler(need by mindspore-ascend), please check if you have set env "
  309. "PATH, you can reference to the installation guidelines https://www.mindspore.cn/install")
  310. if self.python_path is None or self.python_path_check not in self.python_path:
  311. logger.warning(
  312. "Can not find tbe op implement(need by mindspore-ascend), please check if you have set env "
  313. "PYTHONPATH, you can reference to the installation guidelines "
  314. "https://www.mindspore.cn/install")
  315. if self.ld_lib_path is None or not (self.ld_lib_path_check_fwk in self.ld_lib_path and
  316. self.ld_lib_path_check_addons in self.ld_lib_path):
  317. logger.warning("Can not find driver so(need by mindspore-ascend), please check if you have set env "
  318. "LD_LIBRARY_PATH, you can reference to the installation guidelines "
  319. "https://www.mindspore.cn/install")
  320. if self.ascend_opp_path is None or self.ascend_opp_path_check not in self.ascend_opp_path:
  321. logger.warning(
  322. "Can not find opp path (need by mindspore-ascend), please check if you have set env ASCEND_OPP_PATH, "
  323. "you can reference to the installation guidelines https://www.mindspore.cn/install")
  324. def _read_version(self, file_path):
  325. """get ascend version info"""
  326. with open(file_path, 'r') as f:
  327. all_info = f.readlines()
  328. for line in all_info:
  329. if line.startswith("Version="):
  330. self.v = line.strip().split("=")[1]
  331. return self.v
  332. return self.v
  333. def check_version_and_env_config():
  334. """check version and env config"""
  335. if __package_name__.lower() == "mindspore-ascend":
  336. env_checker = AscendEnvChecker()
  337. elif __package_name__.lower() == "mindspore-gpu":
  338. env_checker = GPUEnvChecker()
  339. else:
  340. logger.info(f"Package version {__package_name__} does not need to check any environment variable, skipping.")
  341. return
  342. try:
  343. from . import _c_expression
  344. # check version of ascend site or cuda
  345. env_checker.check_version()
  346. env_checker.set_env()
  347. except ImportError as e:
  348. env_checker.check_env(e)
  349. def _set_pb_env():
  350. """Set env variable `PROTOCOL_BUFFERS` to prevent memory overflow."""
  351. if os.getenv("PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION") == "cpp":
  352. logger.info("Current env variable `PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp`. "
  353. "When the checkpoint file is too large, "
  354. "it may cause memory limit error during load checkpoint file. "
  355. "This can be solved by set env `PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python`.")
  356. elif os.getenv("PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION") is None:
  357. logger.info("Setting the env `PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python` to prevent memory overflow "
  358. "during save or load checkpoint file.")
  359. os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
# Executed when this module is imported: run the backend version/environment
# check first, then configure the protobuf implementation variable.
check_version_and_env_config()
_set_pb_env()