You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

_check_version.py 17 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """version and config check"""
  16. import os
  17. import sys
  18. import subprocess
  19. from pathlib import Path
  20. from abc import abstractmethod, ABCMeta
  21. import numpy as np
  22. from packaging import version
  23. from . import log as logger
  24. from .version import __version__
  25. from .default_config import __package_name__
  26. class EnvChecker(metaclass=ABCMeta):
  27. """basic class for environment check"""
  28. @abstractmethod
  29. def check_env(self, e):
  30. pass
  31. @abstractmethod
  32. def set_env(self):
  33. pass
  34. @abstractmethod
  35. def check_version(self):
  36. pass
  37. class GPUEnvChecker(EnvChecker):
  38. """GPU environment check."""
  39. def __init__(self):
  40. self.version = ["10.1", "11.1"]
  41. self.lib_key_to_lib_name = {'libcu': 'libcuda.so'}
  42. # env
  43. self.path = os.getenv("PATH")
  44. self.ld_lib_path = os.getenv("LD_LIBRARY_PATH")
  45. # check
  46. self.v = "0"
  47. self.cuda_lib_path = self._get_lib_path("libcu")
  48. self.cuda_bin_path = self._get_bin_path("cuda")
  49. def check_env(self, e):
  50. raise e
  51. def set_env(self):
  52. return
  53. def _get_bin_path(self, bin_name):
  54. """Get bin path by bin name."""
  55. if bin_name == "cuda":
  56. return self._get_cuda_bin_path()
  57. return []
  58. def _get_cuda_bin_path(self):
  59. """Get cuda bin path by lib path."""
  60. path_list = []
  61. for path in self.cuda_lib_path:
  62. path = os.path.abspath(path.strip()+"/bin/")
  63. if Path(path).is_dir():
  64. path_list.append(path)
  65. return np.unique(path_list)
  66. def _get_nvcc_version(self, is_set_env):
  67. """Get cuda version by nvcc command."""
  68. nvcc_result = subprocess.run(["nvcc --version | grep release"],
  69. timeout=3, text=True, capture_output=True, check=False, shell=True)
  70. if nvcc_result.returncode:
  71. if not is_set_env:
  72. for path in self.cuda_bin_path:
  73. if Path(path + "/nvcc").is_file():
  74. os.environ['PATH'] = path + ":" + os.environ['PATH']
  75. return self._get_nvcc_version(True)
  76. return ""
  77. result = nvcc_result.stdout
  78. for line in result.split('\n'):
  79. if line:
  80. return line.strip().split("release")[1].split(",")[0].strip()
  81. return ""
  82. def check_version(self):
  83. """Check cuda version."""
  84. version_match = False
  85. for path in self.cuda_lib_path:
  86. version_file = path + "/version.txt"
  87. if not Path(version_file).is_file():
  88. continue
  89. if self._check_version(version_file):
  90. version_match = True
  91. break
  92. if not version_match:
  93. if self.v == "0":
  94. logger.warning("Cuda version file version.txt is not found, please confirm that the correct "
  95. "cuda version has been installed, you can refer to the "
  96. "installation guidelines: https://www.mindspore.cn/install")
  97. else:
  98. logger.warning(f"MindSpore version {__version__} and cuda version {self.v} does not match, "
  99. "please refer to the installation guide for version matching "
  100. "information: https://www.mindspore.cn/install")
  101. nvcc_version = self._get_nvcc_version(False)
  102. if nvcc_version and (nvcc_version not in self.version):
  103. logger.warning(f"MindSpore version {__version__} and nvcc(cuda bin) version {nvcc_version} "
  104. "does not match, please refer to the installation guide for version matching "
  105. "information: https://www.mindspore.cn/install")
  106. def _check_version(self, version_file):
  107. """Check cuda version by version.txt."""
  108. v = self._read_version(version_file)
  109. v = version.parse(v)
  110. v_str = str(v.major) + "." + str(v.minor)
  111. if v_str not in self.version:
  112. return False
  113. return True
  114. def _get_lib_path(self, lib_name):
  115. """Get gpu lib path by ldd command."""
  116. path_list = []
  117. current_path = os.path.split(os.path.realpath(__file__))[0]
  118. try:
  119. ldd_result = subprocess.run(["ldd " + current_path + "/_c_expression*.so* | grep " + lib_name],
  120. timeout=10, text=True, capture_output=True, check=False, shell=True)
  121. if ldd_result.returncode:
  122. logger.error(f"{self.lib_key_to_lib_name[lib_name]} (need by mindspore-gpu) is not found, please "
  123. f"confirm that _c_expression.so is in directory:{current_path} and the correct cuda "
  124. "version has been installed, you can refer to the installation "
  125. "guidelines: https://www.mindspore.cn/install")
  126. return path_list
  127. result = ldd_result.stdout
  128. for i in result.split('\n'):
  129. path = i.partition("=>")[2]
  130. if path.lower().find("not found") > 0:
  131. logger.warning(f"Cuda {self.version} version(need by mindspore-gpu) is not found, please confirm "
  132. "that the path of cuda is set to the env LD_LIBRARY_PATH, please refer to the "
  133. "installation guidelines: https://www.mindspore.cn/install")
  134. continue
  135. path = path.partition(lib_name)[0]
  136. if path:
  137. path_list.append(os.path.abspath(path.strip() + "../"))
  138. return np.unique(path_list)
  139. except subprocess.TimeoutExpired:
  140. logger.warning("Failed to check cuda version due to the ldd command timeout, please confirm that "
  141. "the correct cuda version has been installed, you can refer to the "
  142. "installation guidelines: https://www.mindspore.cn/install")
  143. return path_list
  144. def _read_version(self, file_path):
  145. """Get gpu version info in version.txt."""
  146. with open(file_path, 'r') as f:
  147. all_info = f.readlines()
  148. for line in all_info:
  149. if line.startswith("CUDA Version"):
  150. self.v = line.strip().split("CUDA Version")[1]
  151. return self.v
  152. return self.v
  153. class AscendEnvChecker(EnvChecker):
  154. """ascend environment check"""
  155. def __init__(self):
  156. self.version = ["1.77.22.3.220"]
  157. atlas_nnae_version = "/usr/local/Ascend/nnae/latest/fwkacllib/version.info"
  158. atlas_toolkit_version = "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/version.info"
  159. hisi_fwk_version = "/usr/local/Ascend/fwkacllib/version.info"
  160. if os.path.exists(atlas_nnae_version):
  161. # atlas default path
  162. self.fwk_path = "/usr/local/Ascend/nnae/latest/fwkacllib"
  163. self.op_impl_path = "/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe"
  164. self.tbe_path = self.fwk_path + "/lib64"
  165. self.cce_path = self.fwk_path + "/ccec_compiler/bin"
  166. self.fwk_version = atlas_nnae_version
  167. self.op_path = "/usr/local/Ascend/nnae/latest/opp"
  168. elif os.path.exists(atlas_toolkit_version):
  169. # atlas default path
  170. self.fwk_path = "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib"
  171. self.op_impl_path = "/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe"
  172. self.tbe_path = self.fwk_path + "/lib64"
  173. self.cce_path = self.fwk_path + "/ccec_compiler/bin"
  174. self.fwk_version = atlas_toolkit_version
  175. self.op_path = "/usr/local/Ascend/ascend-toolkit/latest/opp"
  176. elif os.path.exists(hisi_fwk_version):
  177. # hisi default path
  178. self.fwk_path = "/usr/local/Ascend/fwkacllib"
  179. self.op_impl_path = "/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe"
  180. self.tbe_path = self.fwk_path + "/lib64"
  181. self.cce_path = self.fwk_path + "/ccec_compiler/bin"
  182. self.fwk_version = hisi_fwk_version
  183. self.op_path = "/usr/local/Ascend/opp"
  184. else:
  185. # custom or unknown environment
  186. self.fwk_path = ""
  187. self.op_impl_path = ""
  188. self.tbe_path = ""
  189. self.cce_path = ""
  190. self.fwk_version = ""
  191. self.op_path = ""
  192. # env
  193. self.path = os.getenv("PATH")
  194. self.python_path = os.getenv("PYTHONPATH")
  195. self.ld_lib_path = os.getenv("LD_LIBRARY_PATH")
  196. self.ascend_opp_path = os.getenv("ASCEND_OPP_PATH")
  197. # check content
  198. self.path_check = "/fwkacllib/ccec_compiler/bin"
  199. self.python_path_check = "opp/op_impl/built-in/ai_core/tbe"
  200. self.ld_lib_path_check_fwk = "/fwkacllib/lib64"
  201. self.ld_lib_path_check_addons = "/add-ons"
  202. self.ascend_opp_path_check = "/op"
  203. self.v = ""
  204. def check_env(self, e):
  205. self._check_env()
  206. raise e
  207. def check_version(self):
  208. if not Path(self.fwk_version).is_file():
  209. logger.warning("Using custom Ascend 910 AI software package path, package version checking is skipped, "
  210. "please make sure Ascend 910 AI software package version is supported, you can reference to "
  211. "the installation guidelines https://www.mindspore.cn/install")
  212. return
  213. v = self._read_version(self.fwk_version)
  214. if v not in self.version:
  215. v_list = str([x for x in self.version])
  216. logger.warning(f"MindSpore version {__version__} and Ascend 910 AI software package version {v} does not "
  217. f"match, the version of software package expect one of {v_list}, "
  218. "please reference to the match info on: https://www.mindspore.cn/install")
  219. def check_deps_version(self):
  220. """
  221. te, topi, hccl wheel package version check
  222. in order to update the change of 'LD_LIBRARY_PATH' env, run a sub process
  223. """
  224. input_args = ["--mindspore_version=" + __version__]
  225. for v in self.version:
  226. input_args.append("--supported_version=" + v)
  227. deps_version_checker = os.path.join(os.path.split(os.path.realpath(__file__))[0], "_check_deps_version.py")
  228. call_cmd = [sys.executable, deps_version_checker] + input_args
  229. try:
  230. process = subprocess.run(call_cmd, timeout=3, text=True, capture_output=True, check=False)
  231. if process.stdout.strip() != "":
  232. logger.warning(process.stdout.strip())
  233. except subprocess.TimeoutExpired:
  234. logger.info("Package te, topi, hccl version check timed out, skip.")
  235. def set_env(self):
  236. if not self.tbe_path:
  237. self._check_env()
  238. return
  239. try:
  240. # pylint: disable=unused-import
  241. import te
  242. # pylint: disable=broad-except
  243. except Exception:
  244. if Path(self.tbe_path).is_dir():
  245. if os.getenv('LD_LIBRARY_PATH'):
  246. os.environ['LD_LIBRARY_PATH'] = self.tbe_path + ":" + os.environ['LD_LIBRARY_PATH']
  247. else:
  248. os.environ['LD_LIBRARY_PATH'] = self.tbe_path
  249. else:
  250. raise EnvironmentError(
  251. f"No such directory: {self.tbe_path}, Please check if Ascend 910 AI software package is "
  252. "installed correctly.")
  253. # check te version after set te env
  254. self.check_deps_version()
  255. if Path(self.op_impl_path).is_dir():
  256. # python path for sub process
  257. if os.getenv('PYTHONPATH'):
  258. os.environ['PYTHONPATH'] = self.op_impl_path + ":" + os.environ['PYTHONPATH']
  259. else:
  260. os.environ['PYTHONPATH'] = self.op_impl_path
  261. # sys path for this process
  262. sys.path.append(self.op_impl_path)
  263. os.environ['TBE_IMPL_PATH'] = self.op_impl_path
  264. else:
  265. raise EnvironmentError(
  266. f"No such directory: {self.op_impl_path}, Please check if Ascend 910 AI software package is "
  267. "installed correctly.")
  268. if Path(self.cce_path).is_dir():
  269. os.environ['PATH'] = self.cce_path + ":" + os.environ['PATH']
  270. else:
  271. raise EnvironmentError(
  272. f"No such directory: {self.cce_path}, Please check if Ascend 910 AI software package is "
  273. "installed correctly.")
  274. if self.op_path is None:
  275. pass
  276. elif Path(self.op_path).is_dir():
  277. os.environ['ASCEND_OPP_PATH'] = self.op_path
  278. else:
  279. raise EnvironmentError(
  280. f"No such directory: {self.op_path}, Please check if Ascend 910 AI software package is "
  281. "installed correctly.")
  282. def _check_env(self):
  283. """ascend dependence path check"""
  284. if self.path is None or self.path_check not in self.path:
  285. logger.warning("Can not find ccec_compiler(need by mindspore-ascend), please check if you have set env "
  286. "PATH, you can reference to the installation guidelines https://www.mindspore.cn/install")
  287. if self.python_path is None or self.python_path_check not in self.python_path:
  288. logger.warning(
  289. "Can not find tbe op implement(need by mindspore-ascend), please check if you have set env "
  290. "PYTHONPATH, you can reference to the installation guidelines "
  291. "https://www.mindspore.cn/install")
  292. if self.ld_lib_path is None or not (self.ld_lib_path_check_fwk in self.ld_lib_path and
  293. self.ld_lib_path_check_addons in self.ld_lib_path):
  294. logger.warning("Can not find driver so(need by mindspore-ascend), please check if you have set env "
  295. "LD_LIBRARY_PATH, you can reference to the installation guidelines "
  296. "https://www.mindspore.cn/install")
  297. if self.ascend_opp_path is None or self.ascend_opp_path_check not in self.ascend_opp_path:
  298. logger.warning(
  299. "Can not find opp path (need by mindspore-ascend), please check if you have set env ASCEND_OPP_PATH, "
  300. "you can reference to the installation guidelines https://www.mindspore.cn/install")
  301. def _read_version(self, file_path):
  302. """get ascend version info"""
  303. with open(file_path, 'r') as f:
  304. all_info = f.readlines()
  305. for line in all_info:
  306. if line.startswith("Version="):
  307. self.v = line.strip().split("=")[1]
  308. return self.v
  309. return self.v
  310. def check_version_and_env_config():
  311. """check version and env config"""
  312. if __package_name__.lower() == "mindspore-ascend":
  313. env_checker = AscendEnvChecker()
  314. elif __package_name__.lower() == "mindspore-gpu":
  315. env_checker = GPUEnvChecker()
  316. else:
  317. logger.info(f"Package version {__package_name__} does not need to check any environment variable, skipping.")
  318. return
  319. try:
  320. # pylint: disable=unused-import
  321. from . import _c_expression
  322. # check version of ascend site or cuda
  323. env_checker.check_version()
  324. env_checker.set_env()
  325. except ImportError as e:
  326. env_checker.check_env(e)
  327. def _set_pb_env():
  328. """Set env variable `PROTOCOL_BUFFERS` to prevent memory overflow."""
  329. if os.getenv("PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION") == "cpp":
  330. logger.info("Current env variable `PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp`. "
  331. "When the checkpoint file is too large, "
  332. "it may cause memory limit error during load checkpoint file. "
  333. "This can be solved by set env `PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python`.")
  334. elif os.getenv("PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION") is None:
  335. logger.info("Setting the env `PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python` to prevent memory overflow "
  336. "during save or load checkpoint file.")
  337. os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
# Module-level calls, they run as soon as this module is imported: the backend
# environment check comes first, then the protobuf implementation variable is set.
check_version_and_env_config()
_set_pb_env()