You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

gpu_common.h 19 kB

5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_COMMON_H_
  17. #define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_COMMON_H_
  18. #include <cublas_v2.h>
  19. #include <iostream>
  20. #include <vector>
  21. #include <string>
  22. #include <algorithm>
  23. #include <map>
  24. #include <sstream>
  25. #include "utils/log_adapter.h"
  26. #include "utils/trace_base.h"
  27. #include "include/curand.h"
  28. namespace mindspore {
  29. namespace device {
  30. namespace gpu {
  31. #define CHECK_OP_RET_WITH_EXCEPT(expression, message) \
  32. { \
  33. bool success = (expression); \
  34. if (!success) { \
  35. MS_LOG(EXCEPTION) << "Op Error: " << message << " | Error Number: " << success; \
  36. } \
  37. }
  38. #define CHECK_OP_RET_WITH_ERROR(expression, message) \
  39. { \
  40. bool success = (expression); \
  41. if (!success) { \
  42. MS_LOG(ERROR) << "Op Error: " << message << " | Error Number: " << success; \
  43. } \
  44. }
// Evaluates `expression` (a bool); on failure logs `message` at ERROR level and executes
// `return false;` in the CALLER's scope — only usable inside functions returning bool
// (or a type constructible from false).
#define CHECK_RET_WITH_RETURN_ERROR(expression, message) \
  {                                                      \
    bool success = (expression);                         \
    if (!success) {                                      \
      MS_LOG(ERROR) << message;                          \
      return false;                                      \
    }                                                    \
  }
// CUDA runtime checks. Each evaluates `expression` once into a cudaError_t and reports
// non-cudaSuccess results with the numeric status plus cudaGetErrorString(status).
// Variants differ along two axes:
//   *_ERROR vs *_EXCEPT   — log and continue vs log and throw (MS_LOG(EXCEPTION));
//   with/without _NOTRACE — whether the graph-node source trace is appended.
// `node.lock()` suggests `node` is a weak_ptr-like handle to the kernel's graph node,
// resolved for trace::DumpSourceLines — TODO confirm at call sites.

// Log ERROR (with node trace) on CUDA failure; execution continues.
#define CHECK_CUDA_RET_WITH_ERROR(node, expression, message)                                                           \
  {                                                                                                                    \
    cudaError_t status = (expression);                                                                                 \
    if (status != cudaSuccess) {                                                                                       \
      MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " << cudaGetErrorString(status) \
                    << trace::DumpSourceLines(node.lock());                                                            \
    }                                                                                                                  \
  }
// Log ERROR (no trace) on CUDA failure; execution continues.
#define CHECK_CUDA_RET_WITH_ERROR_NOTRACE(expression, message)                                   \
  {                                                                                              \
    cudaError_t status = (expression);                                                           \
    if (status != cudaSuccess) {                                                                 \
      MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " "         \
                    << cudaGetErrorString(status);                                               \
    }                                                                                            \
  }
// Log ERROR (no trace) and `return false;` from the enclosing function on CUDA failure.
#define CHECK_CUDA_RET_WITH_RETURN_ERROR_NOTRACE(expression, message)                            \
  {                                                                                              \
    cudaError_t status = (expression);                                                           \
    if (status != cudaSuccess) {                                                                 \
      MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " "         \
                    << cudaGetErrorString(status);                                               \
      return false;                                                                              \
    }                                                                                            \
  }
// Throw (with node trace) on CUDA failure.
#define CHECK_CUDA_RET_WITH_EXCEPT(node, expression, message)                                    \
  {                                                                                              \
    cudaError_t status = (expression);                                                           \
    if (status != cudaSuccess) {                                                                 \
      MS_LOG(EXCEPTION) << "CUDA Error: " << message << " | Error Number: " << status << " "     \
                        << cudaGetErrorString(status) << trace::DumpSourceLines(node.lock());    \
    }                                                                                            \
  }
// Throw (no trace) on CUDA failure.
#define CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(expression, message)                                  \
  {                                                                                              \
    cudaError_t status = (expression);                                                           \
    if (status != cudaSuccess) {                                                                 \
      MS_LOG(EXCEPTION) << "CUDA Error: " << message << " | Error Number: " << status << " "     \
                        << cudaGetErrorString(status);                                           \
    }                                                                                            \
  }
// cuDNN status checks, mirroring the CUDA macro family above: evaluate `expression`
// once into a cudnnStatus_t and report anything other than CUDNN_STATUS_SUCCESS with
// the numeric status and cudnnGetErrorString(status).

// Throw (with node source trace) on cuDNN failure.
#define CHECK_CUDNN_RET_WITH_EXCEPT(node, expression, message)                                   \
  {                                                                                              \
    cudnnStatus_t status = (expression);                                                         \
    if (status != CUDNN_STATUS_SUCCESS) {                                                        \
      MS_LOG(EXCEPTION) << "cuDNN Error: " << message << " | Error Number: " << status << " "    \
                        << cudnnGetErrorString(status) << trace::DumpSourceLines(node.lock());   \
    }                                                                                            \
  }
// Throw (no trace) on cuDNN failure.
#define CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(expression, message)                                 \
  {                                                                                              \
    cudnnStatus_t status = (expression);                                                         \
    if (status != CUDNN_STATUS_SUCCESS) {                                                        \
      MS_LOG(EXCEPTION) << "cuDNN Error: " << message << " | Error Number: " << status << " "    \
                        << cudnnGetErrorString(status);                                          \
    }                                                                                            \
  }
// Log ERROR (no trace) on cuDNN failure; execution continues.
#define CHECK_CUDNN_RET_WITH_ERROR_NOTRACE(expression, message)                                  \
  {                                                                                              \
    cudnnStatus_t status = (expression);                                                         \
    if (status != CUDNN_STATUS_SUCCESS) {                                                        \
      MS_LOG(ERROR) << "cuDNN Error: " << message << " | Error Number: " << status << " "        \
                    << cudnnGetErrorString(status);                                              \
    }                                                                                            \
  }
// Log ERROR (with node source trace) on cuDNN failure; execution continues.
#define CHECK_CUDNN_RET_WITH_ERROR(node, expression, message)                                    \
  {                                                                                              \
    cudnnStatus_t status = (expression);                                                         \
    if (status != CUDNN_STATUS_SUCCESS) {                                                        \
      MS_LOG(ERROR) << "cuDNN Error: " << message << " | Error Number: " << status << " "        \
                    << cudnnGetErrorString(status) << trace::DumpSourceLines(node.lock());       \
    }                                                                                            \
  }
// cuBLAS status checks. cuBLAS provides no error-string API at this CUDA level, so these
// use the fully qualified cuBlasGetErrorString() defined later in this header (macros
// expand at the use site, so the forward reference is fine).

// Throw (no trace) on cuBLAS failure.
#define CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(expression, message)                                \
  {                                                                                              \
    cublasStatus_t status = (expression);                                                        \
    if (status != CUBLAS_STATUS_SUCCESS) {                                                       \
      MS_LOG(EXCEPTION) << "cuBLAS Error: " << message << " | Error Number: " << status << " "   \
                        << mindspore::device::gpu::cuBlasGetErrorString(status);                 \
    }                                                                                            \
  }
// Throw (with node source trace) on cuBLAS failure.
#define CHECK_CUBLAS_RET_WITH_EXCEPT(node, expression, message)                                  \
  {                                                                                              \
    cublasStatus_t status = (expression);                                                        \
    if (status != CUBLAS_STATUS_SUCCESS) {                                                       \
      MS_LOG(EXCEPTION) << "cuBLAS Error: " << message << " | Error Number: " << status << " "   \
                        << mindspore::device::gpu::cuBlasGetErrorString(status)                  \
                        << trace::DumpSourceLines(node.lock());                                  \
    }                                                                                            \
  }
// Log ERROR on cuBLAS failure; execution continues.
#define CHECK_CUBLAS_RET_WITH_ERROR(expression, message)                                         \
  {                                                                                              \
    cublasStatus_t status = (expression);                                                        \
    if (status != CUBLAS_STATUS_SUCCESS) {                                                       \
      MS_LOG(ERROR) << "cuBLAS Error: " << message << " | Error Number: " << status << " "       \
                    << mindspore::device::gpu::cuBlasGetErrorString(status);                     \
    }                                                                                            \
  }
  151. #define CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(expression, message) \
  152. { \
  153. cusolverStatus_t status = (expression); \
  154. if (status != CUSOLVER_STATUS_SUCCESS) { \
  155. MS_LOG(EXCEPTION) << "cusolver Error: " << message << " | Error Number: " << status; \
  156. } \
  157. }
  158. #define CHECK_CUSOLVER_RET_WITH_EXCEPT(node, expression, message) \
  159. { \
  160. cusolverStatus_t status = (expression); \
  161. if (status != CUSOLVER_STATUS_SUCCESS) { \
  162. MS_LOG(EXCEPTION) << "cusolver Error: " << message << " | Error Number: " << status \
  163. << trace::DumpSourceLines(node.lock()); \
  164. ; \
  165. } \
  166. }
  167. #define CHECK_CUSOLVER_RET_WITH_ERROR(expression, message) \
  168. { \
  169. cusolverStatus_t status = (expression); \
  170. if (status != CUSOLVER_STATUS_SUCCESS) { \
  171. MS_LOG(ERROR) << "cusolver Error: " << message << " | Error Number: " << status; \
  172. } \
  173. }
// Evaluates a NCCL call; any result other than ncclSuccess logs and throws, appending the
// node source trace. NOTE(review): `result` is declared as plain int rather than
// ncclResult_t — presumably so this header avoids a hard dependency on nccl.h (the use
// site must still have ncclSuccess in scope for the expansion to compile) — confirm.
#define CHECK_NCCL_RET_WITH_EXCEPT(node, expression, message)                                    \
  {                                                                                              \
    int result = (expression);                                                                   \
    if (result != ncclSuccess) {                                                                 \
      MS_LOG(EXCEPTION) << "NCCL Error: " << message << " | Error Number: " << result            \
                        << trace::DumpSourceLines(node.lock());                                  \
    }                                                                                            \
  }
// Suppresses unused-variable warnings by evaluating `var` and discarding the result.
#define VARIABLE_NOT_USED(var) \
  { (void)(var); }
  184. inline bool CheckNullInput(const std::vector<size_t> &input_shape) {
  185. // If input_shape.size() == 0, it means a scalar input; If input_shape.size() != 0 and input_shape contains 0,
  186. // it means a null input. Just return a null output.
  187. if (input_shape.size() != 0) {
  188. if (std::any_of(input_shape.begin(), input_shape.end(), [](size_t i) { return i == 0; })) {
  189. return true;
  190. }
  191. }
  192. return false;
  193. }
  194. #define CHECK_NULL_INPUT(input_shape) mindspore::device::gpu::CheckNullInput(input_shape)
  195. template <typename T>
  196. inline std::string ConvertVectorToString(const std::vector<T> &value) {
  197. std::stringstream ss;
  198. ss << "(";
  199. for (auto it = value.begin(); it != value.end(); it++) {
  200. if (it == value.begin()) {
  201. ss << *it;
  202. } else {
  203. ss << ", " << *it;
  204. }
  205. }
  206. ss << ")";
  207. return ss.str();
  208. }
  209. #define CONVERT_VECTOR_TO_STRING(value) mindspore::device::gpu::ConvertVectorToString(value)
  210. inline bool CheckShapeNull(const std::vector<size_t> &shape, std::string kernel_name, std::string param_name) {
  211. if (CHECK_NULL_INPUT(shape)) {
  212. MS_LOG(WARNING) << "For '" << kernel_name << "', the shape of " << param_name << " cannot contain zero, but got "
  213. << CONVERT_VECTOR_TO_STRING(shape);
  214. return true;
  215. }
  216. return false;
  217. }
  218. #define CHECK_SHAPE_NULL(shape, kernel_name, param_name) \
  219. mindspore::device::gpu::CheckShapeNull(shape, kernel_name, param_name)
  220. inline const char *CurandGetErrorString(curandStatus_t status) {
  221. switch (status) {
  222. case CURAND_STATUS_VERSION_MISMATCH:
  223. return "Header file and linked library version do not match.";
  224. case CURAND_STATUS_NOT_INITIALIZED:
  225. return "Generator not initialized.";
  226. case CURAND_STATUS_ALLOCATION_FAILED:
  227. return "Memory allocation failed.";
  228. case CURAND_STATUS_TYPE_ERROR:
  229. return "Generator is wrong type.";
  230. case CURAND_STATUS_OUT_OF_RANGE:
  231. return "Argument out of range.";
  232. case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
  233. return "Length requested is not a multiple of dimension.";
  234. case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
  235. return "GPU does not have double precision required by MRG32k3a.";
  236. case CURAND_STATUS_LAUNCH_FAILURE:
  237. return "Kernel launch failure.";
  238. case CURAND_STATUS_PREEXISTING_FAILURE:
  239. return "Preexisting failure on library entry.";
  240. case CURAND_STATUS_INITIALIZATION_FAILED:
  241. return "Initialization of CUDA failed.";
  242. case CURAND_STATUS_ARCH_MISMATCH:
  243. return "Architecture mismatch, GPU does not support requested feature.";
  244. case CURAND_STATUS_INTERNAL_ERROR:
  245. return "Internal library error.";
  246. default:
  247. return "Unknown the curandStatus.";
  248. }
  249. }
// Maps a cublasStatus_t to the descriptive text used by the CHECK_CUBLAS_* macros above.
// Unlike the cuRAND helper, CUBLAS_STATUS_SUCCESS has an explicit entry; unknown values
// fall through to the default. (Some messages carry a trailing space — kept as-is, since
// these strings are emitted at runtime.)
inline const char *cuBlasGetErrorString(cublasStatus_t status) {
  switch (status) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS: The operation completed successfully.";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED: The cuBLAS library was not initialized.";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED: Resource allocation failed inside the cuBLAS library. This is usually caused "
             "by a cudaMalloc() failure. ";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE: An unsupported value or parameter was passed to the function (a negative "
             "vector size, for example).";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH: The function requires a feature absent from the device architecture; "
             "usually caused by compute capability lower than 5.0.";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR: An access to GPU memory space failed, which is usually caused by a failure "
             "to bind a texture.";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED: The GPU program failed to execute. This is often caused by a launch "
             "failure of the kernel on the GPU, which can be caused by multiple reasons.";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR: An internal cuBLAS operation failed. This error is usually caused by a "
             "cudaMemcpyAsync() failure. ";
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED: The functionality requested is not supported.";
    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR: The functionality requested requires some license and an error was detected "
             "when trying to check the current licensing. This error can happen if the license is not present or is "
             "expired or if the environment variable NVIDIA_LICENSE_FILE is not set properly. ";
    default:
      return "Unknown cublasStatus.";
  }
}
// Evaluates a cuRAND call; any status other than CURAND_STATUS_SUCCESS logs the numeric
// status plus CurandGetErrorString(status) and throws via MS_LOG(EXCEPTION).
#define CHECK_CURAND_RET_WITH_EXCEPT(expression, message)                                        \
  {                                                                                              \
    curandStatus_t status = (expression);                                                        \
    if (status != CURAND_STATUS_SUCCESS) {                                                       \
      MS_LOG(EXCEPTION) << "CUDA curand Error: " << message << " | curandStatus: " << status     \
                        << " "                                                                   \
                        << mindspore::device::gpu::CurandGetErrorString(status);                 \
    }                                                                                            \
  }
  292. } // namespace gpu
  293. } // namespace device
  294. } // namespace mindspore
  295. #endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_COMMON_H_