From: @VectorSL Reviewed-by: @cristoval Signed-off-by:pull/13417/MERGE
| @@ -357,15 +357,11 @@ checkopts() | |||||
| if [[ "X$DEVICE_VERSION" == "X" ]]; then | if [[ "X$DEVICE_VERSION" == "X" ]]; then | ||||
| DEVICE_VERSION=10.1 | DEVICE_VERSION=10.1 | ||||
| fi | fi | ||||
| if [[ "X$DEVICE_VERSION" != "X9.2" && "X$DEVICE_VERSION" != "X10.1" ]]; then | |||||
| if [[ "X$DEVICE_VERSION" != "X11.1" && "X$DEVICE_VERSION" != "X10.1" ]]; then | |||||
| echo "Invalid value ${DEVICE_VERSION} for option -V" | echo "Invalid value ${DEVICE_VERSION} for option -V" | ||||
| usage | usage | ||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| if [[ "X$DEVICE_VERSION" == "X9.2" ]]; then | |||||
| echo "Unsupported CUDA version 9.2" | |||||
| exit 1 | |||||
| fi | |||||
| CUDA_VERSION="$DEVICE_VERSION" | CUDA_VERSION="$DEVICE_VERSION" | ||||
| elif [[ "X$DEVICE" == "Xd" || "X$DEVICE" == "Xascend" ]]; then | elif [[ "X$DEVICE" == "Xd" || "X$DEVICE" == "Xascend" ]]; then | ||||
| # version default 910 | # version default 910 | ||||
| @@ -45,7 +45,7 @@ class GPUEnvChecker(EnvChecker): | |||||
| """GPU environment check.""" | """GPU environment check.""" | ||||
| def __init__(self): | def __init__(self): | ||||
| self.version = ["10.1"] | |||||
| self.version = ["10.1", "11.1"] | |||||
| self.lib_key_to_lib_name = {'libcu': 'libcuda.so'} | self.lib_key_to_lib_name = {'libcu': 'libcuda.so'} | ||||
| # env | # env | ||||
| self.path = os.getenv("PATH") | self.path = os.getenv("PATH") | ||||
| @@ -127,7 +127,11 @@ if(ENABLE_GPU) | |||||
| endif() | endif() | ||||
| set(NVCC_TMP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) | set(NVCC_TMP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) | ||||
| string(REPLACE "-std=c++17" "-std=c++11" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") | |||||
| if(${CUDA_VERSION} VERSION_LESS 11.0) | |||||
| string(REPLACE "-std=c++17" "-std=c++11" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") | |||||
| else() | |||||
| string(REPLACE "-std=c++17" "-std=c++14" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") | |||||
| endif() | |||||
| set_property(SOURCE ${GPU_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE) | set_property(SOURCE ${GPU_SRC_LIST} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE) | ||||
| cuda_add_library(gpu_cuda_lib STATIC ${GPU_SRC_LIST}) | cuda_add_library(gpu_cuda_lib STATIC ${GPU_SRC_LIST}) | ||||
| set(CMAKE_CXX_FLAGS ${NVCC_TMP_CMAKE_CXX_FLAGS}) | set(CMAKE_CXX_FLAGS ${NVCC_TMP_CMAKE_CXX_FLAGS}) | ||||
| @@ -310,23 +310,15 @@ class Conv2dGpuFwdKernel : public GpuKernel { | |||||
| "cudnnSetTensor4dDescriptor failed"); | "cudnnSetTensor4dDescriptor failed"); | ||||
| } | } | ||||
| void SelectAlgorithm(cudnnTensorDescriptor_t input_descriptor_real) { | void SelectAlgorithm(cudnnTensorDescriptor_t input_descriptor_real) { | ||||
| if (group_ > 1 || CUDNN_MAJOR < 7) { | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | |||||
| cudnnGetConvolutionForwardAlgorithm( | |||||
| cudnn_handle_, input_descriptor_real, filter_desc_, conv_desc_, output_desc_, | |||||
| CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 0, &conv_algorithm_), | |||||
| "cudnnGetConvolutionForwardAlgorithm failed"); | |||||
| } else { | |||||
| constexpr int requested_algo_count = 1; | |||||
| int returned_algo_count; | |||||
| cudnnConvolutionFwdAlgoPerf_t perf_results; | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||||
| kernel_node_, | |||||
| cudnnGetConvolutionForwardAlgorithm_v7(cudnn_handle_, input_descriptor_real, filter_desc_, conv_desc_, | |||||
| output_desc_, requested_algo_count, &returned_algo_count, &perf_results), | |||||
| "cudnnGetConvolutionForwardAlgorithm_v7 failed"); | |||||
| conv_algorithm_ = perf_results.algo; | |||||
| } | |||||
| constexpr int requested_algo_count = 1; | |||||
| int returned_algo_count = 0; | |||||
| cudnnConvolutionFwdAlgoPerf_t perf_results; | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||||
| kernel_node_, | |||||
| cudnnGetConvolutionForwardAlgorithm_v7(cudnn_handle_, input_descriptor_real, filter_desc_, conv_desc_, | |||||
| output_desc_, requested_algo_count, &returned_algo_count, &perf_results), | |||||
| "cudnnGetConvolutionForwardAlgorithm_v7 failed"); | |||||
| conv_algorithm_ = perf_results.algo; | |||||
| if (cudnn_data_type_ == CUDNN_DATA_HALF) { | if (cudnn_data_type_ == CUDNN_DATA_HALF) { | ||||
| conv_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; | conv_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; | ||||
| } | } | ||||
| @@ -280,23 +280,15 @@ class ConvGradFilterGpuBkwKernel : public GpuKernel { | |||||
| return true; | return true; | ||||
| } | } | ||||
| void SelectAlgorithm(cudnnTensorDescriptor_t x_desc_real) { | void SelectAlgorithm(cudnnTensorDescriptor_t x_desc_real) { | ||||
| if (group_ > 1 || CUDNN_MAJOR < 7) { | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||||
| kernel_node_, | |||||
| cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle_, x_desc_real, dy_desc_, conv_desc_, dw_desc_, | |||||
| CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, 0, &algo_), | |||||
| "GetConvolutionBackwardFilterAlgorithm failed"); | |||||
| } else { | |||||
| constexpr int requested_algo_count = 1; | |||||
| int returned_algo_count; | |||||
| cudnnConvolutionBwdFilterAlgoPerf_t perf_results; | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||||
| kernel_node_, | |||||
| cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnn_handle_, x_desc_real, dy_desc_, conv_desc_, dw_desc_, | |||||
| requested_algo_count, &returned_algo_count, &perf_results), | |||||
| "GetConvolutionBackwardFilterAlgorithm failed"); | |||||
| algo_ = perf_results.algo; | |||||
| } | |||||
| constexpr int requested_algo_count = 1; | |||||
| int returned_algo_count = 0; | |||||
| cudnnConvolutionBwdFilterAlgoPerf_t perf_results; | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||||
| kernel_node_, | |||||
| cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnn_handle_, x_desc_real, dy_desc_, conv_desc_, dw_desc_, | |||||
| requested_algo_count, &returned_algo_count, &perf_results), | |||||
| "GetConvolutionBackwardFilterAlgorithm failed"); | |||||
| algo_ = perf_results.algo; | |||||
| if (cudnn_data_type_ == CUDNN_DATA_HALF) { | if (cudnn_data_type_ == CUDNN_DATA_HALF) { | ||||
| algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; | algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; | ||||
| } | } | ||||
| @@ -289,23 +289,15 @@ class ConvGradInputGpuBkwKernel : public GpuKernel { | |||||
| [](const int64_t &value) { return static_cast<int>(value); }); | [](const int64_t &value) { return static_cast<int>(value); }); | ||||
| } | } | ||||
| void SelectAlgorithm(cudnnTensorDescriptor_t dx_desc_real) { | void SelectAlgorithm(cudnnTensorDescriptor_t dx_desc_real) { | ||||
| if (group_ > 1 || CUDNN_MAJOR < 7) { | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||||
| kernel_node_, | |||||
| cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle_, w_desc_, dy_desc_, conv_desc_, dx_desc_real, | |||||
| CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, 0, &algo_), | |||||
| "cudnnGetConvolutionBackwardDataAlgorithm failed"); | |||||
| } else { | |||||
| constexpr int requested_algo_count = 1; | |||||
| int returned_algo_count; | |||||
| cudnnConvolutionBwdDataAlgoPerf_t perf_results; | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||||
| kernel_node_, | |||||
| cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnn_handle_, w_desc_, dy_desc_, conv_desc_, dx_desc_real, | |||||
| requested_algo_count, &returned_algo_count, &perf_results), | |||||
| "cudnnGetConvolutionBackwardDataAlgorithm_v7 failed"); | |||||
| algo_ = perf_results.algo; | |||||
| } | |||||
| constexpr int requested_algo_count = 1; | |||||
| int returned_algo_count = 0; | |||||
| cudnnConvolutionBwdDataAlgoPerf_t perf_results; | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT( | |||||
| kernel_node_, | |||||
| cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnn_handle_, w_desc_, dy_desc_, conv_desc_, dx_desc_real, | |||||
| requested_algo_count, &returned_algo_count, &perf_results), | |||||
| "cudnnGetConvolutionBackwardDataAlgorithm_v7 failed"); | |||||
| algo_ = perf_results.algo; | |||||
| if (cudnn_data_type_ == CUDNN_DATA_HALF) { | if (cudnn_data_type_ == CUDNN_DATA_HALF) { | ||||
| algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; | algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; | ||||
| } | } | ||||
| @@ -125,12 +125,21 @@ class LstmGpuKernel : public GpuKernel { | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | ||||
| cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0), | cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0), | ||||
| "set dropout_desc failed"); | "set dropout_desc failed"); | ||||
| cudnnRNNBiasMode_t bias_mode = has_bias_ ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS; | |||||
| #if CUDNN_VERSION < 8000 | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | ||||
| cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, | |||||
| input_mode, direction, rnn_mode, algo, cudnn_data_type_), | |||||
| cudnnSetRNNDescriptor_v6(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, | |||||
| input_mode, direction, rnn_mode, algo, cudnn_data_type_), | |||||
| "set rnn_desc failed"); | "set rnn_desc failed"); | ||||
| cudnnRNNBiasMode_t bias_mode = has_bias_ ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS; | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed"); | CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed"); | ||||
| #else | |||||
| cudnnMathType_t math_type = (cudnn_data_type_ == CUDNN_DATA_HALF) ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH; | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | |||||
| cudnnSetRNNDescriptor_v8(rnn_desc_, algo, rnn_mode, bias_mode, direction, input_mode, | |||||
| cudnn_data_type_, cudnn_data_type_, math_type, input_size_, | |||||
| hidden_size_, hidden_size_, num_layers_, dropout_desc_, 0), | |||||
| "set rnn_desc failed"); | |||||
| #endif | |||||
| auto weight_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3); | auto weight_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3); | ||||
| size_t weight_size = weight_shape[0] * weight_shape[1] * weight_shape[2] * sizeof(T); | size_t weight_size = weight_shape[0] * weight_shape[1] * weight_shape[2] * sizeof(T); | ||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | ||||
| @@ -140,12 +140,21 @@ class LstmGradDataGpuKernel : public GpuKernel { | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | ||||
| cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0), | cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0), | ||||
| "set dropout_desc failed"); | "set dropout_desc failed"); | ||||
| cudnnRNNBiasMode_t bias_mode = has_bias_ ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS; | |||||
| #if CUDNN_VERSION < 8000 | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | ||||
| cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, | |||||
| input_mode, direction, rnn_mode, algo, cudnn_data_type_), | |||||
| cudnnSetRNNDescriptor_v6(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, | |||||
| input_mode, direction, rnn_mode, algo, cudnn_data_type_), | |||||
| "set rnn_desc failed"); | "set rnn_desc failed"); | ||||
| cudnnRNNBiasMode_t bias_mode = has_bias_ ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS; | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed"); | CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed"); | ||||
| #else | |||||
| cudnnMathType_t math_type = (cudnn_data_type_ == CUDNN_DATA_HALF) ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH; | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | |||||
| cudnnSetRNNDescriptor_v8(rnn_desc_, algo, rnn_mode, bias_mode, direction, input_mode, | |||||
| cudnn_data_type_, cudnn_data_type_, math_type, input_size_, | |||||
| hidden_size_, hidden_size_, num_layers_, dropout_desc_, 0), | |||||
| "set rnn_desc failed"); | |||||
| #endif | |||||
| auto weight_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4); | auto weight_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 4); | ||||
| size_t weight_size = weight_shape[0] * weight_shape[1] * weight_shape[2] * sizeof(T); | size_t weight_size = weight_shape[0] * weight_shape[1] * weight_shape[2] * sizeof(T); | ||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | ||||
| @@ -114,13 +114,21 @@ class LstmGradWeightGpuKernel : public GpuKernel { | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | ||||
| cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0), | cudnnSetDropoutDescriptor(dropout_desc_, handle_, dropout_, nullptr, 0, 0), | ||||
| "set dropout_desc failed"); | "set dropout_desc failed"); | ||||
| cudnnRNNBiasMode_t bias_mode = has_bias_ ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS; | |||||
| #if CUDNN_VERSION < 8000 | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | ||||
| cudnnSetRNNDescriptor(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, | |||||
| input_mode, direction, rnn_mode, algo, cudnn_data_type_), | |||||
| cudnnSetRNNDescriptor_v6(handle_, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, | |||||
| input_mode, direction, rnn_mode, algo, cudnn_data_type_), | |||||
| "set rnn_desc failed"); | "set rnn_desc failed"); | ||||
| cudnnRNNBiasMode_t bias_mode = has_bias_ ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS; | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed"); | CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnSetRNNBiasMode(rnn_desc_, bias_mode), "set bias_mode failed"); | ||||
| #else | |||||
| cudnnMathType_t math_type = (cudnn_data_type_ == CUDNN_DATA_HALF) ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH; | |||||
| CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, | |||||
| cudnnSetRNNDescriptor_v8(rnn_desc_, algo, rnn_mode, bias_mode, direction, input_mode, | |||||
| cudnn_data_type_, cudnn_data_type_, math_type, input_size_, | |||||
| hidden_size_, hidden_size_, num_layers_, dropout_desc_, 0), | |||||
| "set rnn_desc failed"); | |||||
| #endif | |||||
| auto weight_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | auto weight_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | ||||
| size_t weight_size = weight_shape[0] * weight_shape[1] * weight_shape[2] * sizeof(T); | size_t weight_size = weight_shape[0] * weight_shape[1] * weight_shape[2] * sizeof(T); | ||||