@@ -22,10 +22,12 @@ MS_REG_GPU_KERNEL_TWO(
   GatherV2,
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
   GatherV2GpuFwdKernel, float, int)
 MS_REG_GPU_KERNEL_TWO(
   GatherV2,
   KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
   GatherV2GpuFwdKernel, half, int)
 MS_REG_GPU_KERNEL_TWO(GatherV2,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeFloat32)
@@ -33,6 +35,7 @@ MS_REG_GPU_KERNEL_TWO(GatherV2,
                         .AddInputAttr(kNumberTypeInt64)
                         .AddOutputAttr(kNumberTypeFloat32),
                       GatherV2GpuFwdKernel, float, int)
 MS_REG_GPU_KERNEL_TWO(GatherV2,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeFloat16)
@@ -40,14 +43,17 @@ MS_REG_GPU_KERNEL_TWO(GatherV2,
                         .AddInputAttr(kNumberTypeInt64)
                         .AddOutputAttr(kNumberTypeFloat16),
                       GatherV2GpuFwdKernel, half, int)
 MS_REG_GPU_KERNEL_TWO(
   SparseGatherV2,
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
   GatherV2GpuFwdKernel, float, int)
 MS_REG_GPU_KERNEL_TWO(
   SparseGatherV2,
   KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat16),
   GatherV2GpuFwdKernel, half, int)
 MS_REG_GPU_KERNEL_TWO(SparseGatherV2,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeFloat32)
@@ -55,6 +61,7 @@ MS_REG_GPU_KERNEL_TWO(SparseGatherV2,
                         .AddInputAttr(kNumberTypeInt64)
                         .AddOutputAttr(kNumberTypeFloat32),
                       GatherV2GpuFwdKernel, float, int)
 MS_REG_GPU_KERNEL_TWO(SparseGatherV2,
                       KernelAttr()
                         .AddInputAttr(kNumberTypeFloat16)
@@ -18,7 +18,6 @@
 namespace mindspore {
 namespace kernel {
 MS_REG_GPU_KERNEL_TWO(
   UnsortedSegmentSum,
   KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32),
@@ -85,10 +85,7 @@ bool DatasetIteratorKernel::Init(const CNodePtr &kernel_node) {
 void DatasetIteratorKernel::InitSizeLists() { return; }
-bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
-                                   const std::vector<AddressPtr> &outputs, void *stream) {
-  void *addr = nullptr;
-  size_t len = 0;
+bool DatasetIteratorKernel::ReadDevice(void **addr, size_t *len) {
   uint64_t start_time_stamp = 0;
   uint32_t queue_size = 0;
@@ -98,7 +95,7 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
     start_time_stamp = profiling_op_->GetTimeStamp();
     queue_size = GpuBufferMgr::GetInstance().Size(handle_);
   }
-  auto ret = GpuBufferMgr::GetInstance().Front(handle_, &addr, &len);
+  auto ret = GpuBufferMgr::GetInstance().Front(handle_, addr, len);
   if (ret == device::SUCCESS) {
     if (profiling_enable_) {
       uint64_t end_time_stamp = profiling_op_->GetTimeStamp();
@@ -129,7 +126,16 @@ bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::v
     MS_LOG(ERROR) << "Get data failed, errcode " << ret;
     return false;
   }
+  return true;
+}
+bool DatasetIteratorKernel::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
+                                   const std::vector<AddressPtr> &outputs, void *stream) {
+  void *addr = nullptr;
+  size_t len = 0;
+  if (!ReadDevice(&addr, &len)) {
+    return false;
+  }
   if (total_bytes_ != len) {
     MS_LOG(ERROR) << "Dataset front error. read: " << len << ", expect: " << total_bytes_ << ", ";
     return false;
@@ -43,6 +43,7 @@ class DatasetIteratorKernel : public GpuKernel {
   void InitSizeLists() override;
  private:
+  bool ReadDevice(void **addr, size_t *len);
   std::string queue_name_;
   unsigned int handle_;
   size_t total_bytes_;
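For reference, the ReadDevice/Launch split above follows a common out-parameter pattern: the helper fills caller-provided pointers and reports success, while the caller keeps only the size validation. A minimal, self-contained sketch of that pattern (hypothetical names, not MindSpore code):

#include <cstddef>
#include <cstdio>

// Sketch of the ReadDevice/Launch split: the reader fills the caller's
// pointers through out-parameters, the caller validates what it received.
static bool ReadDevice(void **addr, size_t *len) {
  static char queue_front[64];  // stands in for the device queue element
  *addr = queue_front;
  *len = sizeof(queue_front);
  return true;  // false would signal a failed queue read
}

static bool Launch(size_t total_bytes) {
  void *addr = nullptr;
  size_t len = 0;
  if (!ReadDevice(&addr, &len)) {
    return false;
  }
  if (total_bytes != len) {
    std::printf("Dataset front error. read: %zu, expect: %zu\n", len, total_bytes);
    return false;
  }
  return true;
}

int main() { return Launch(64) ? 0 : 1; }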
@@ -107,10 +107,23 @@ bool GpuKernelFactory::ReducePrecision(
   return GpuKernelFactory::SearchRegistered(kernel_name, builder->Build());
 }
+void GpuKernelFactory::CheckSM(const KernelBuildInfo *kernel_info, const size_t &input_index) {
+  const int major_sm = GET_MAJOR_SM;
+  const bool check_sm = mindspore::device::gpu::CudaCommon::GetInstance().check_sm();
+  if (check_sm && major_sm < RECOMMEND_SM && kernel_info->GetInputDeviceType(input_index) == kNumberTypeFloat16) {
+    if (major_sm < MINIUM_SM) {
+      MS_LOG(EXCEPTION) << "Half precision ops can be used on Devices which computing capacity is >= " << MINIUM_SM
+                        << ", but the current device's computing capacity is " << major_sm;
+    }
+    MS_LOG(WARNING) << "It is recommended to use devices with a computing capacity >= " << RECOMMEND_SM
+                    << ", but the current device's computing capacity is " << major_sm;
+    mindspore::device::gpu::CudaCommon::GetInstance().set_check_sm(false);
+  }
+}
 std::pair<bool, size_t> GpuKernelFactory::GpuKernelAttrCheck(const std::string &kernel_name,
                                                              const KernelBuildInfo *kernel_info) {
   auto iter = map_kernel_name_to_creater_.find(kernel_name);
-  const int marjor_sm = GET_MAJOR_SM;
   if (map_kernel_name_to_creater_.end() == iter) {
     MS_LOG(INFO) << "Not registered GPU kernel: op[" << kernel_name << "]!";
     return std::make_pair(false, 0);
@@ -127,16 +140,7 @@ std::pair<bool, size_t> GpuKernelFactory::GpuKernelAttrCheck(const std::string &
     auto attr_size = (&(iter->second))->at(attr_index).first.GetInputSize();
     // data type matching check of all input parameters of kernel
     for (size_t input_index = 0; input_index < kernel_info->GetInputNum(); input_index++) {
-      const bool check_sm = mindspore::device::gpu::CudaCommon::GetInstance().check_sm();
-      if (check_sm && marjor_sm < RECOMMEND_SM && kernel_info->GetInputDeviceType(input_index) == kNumberTypeFloat16) {
-        if (marjor_sm < MINIUM_SM) {
-          MS_LOG(EXCEPTION) << "Half precision ops can be used on Devices which computing capacity is >= " << MINIUM_SM
-                            << ", but the current device's computing capacity is " << marjor_sm;
-        }
-        MS_LOG(WARNING) << "It is recommended to use devices with a computing capacity >= " << RECOMMEND_SM
-                        << ", but the current device's computing capacity is " << marjor_sm;
-        mindspore::device::gpu::CudaCommon::GetInstance().set_check_sm(false);
-      }
+      GpuKernelFactory::CheckSM(kernel_info, input_index);
       if (kernel_info->GetInputDeviceType(input_index) !=
           (iter->second)[attr_index].first.GetInputAttr(input_index % attr_size).first) {
         flag = false;
@@ -57,6 +57,7 @@ class GpuKernelFactory {
   GpuKernelFactory &operator=(const GpuKernelFactory &);
   std::pair<bool, size_t> GpuKernelAttrCheck(const std::string &kernel_name, const KernelBuildInfo *kernel_info);
+  void CheckSM(const KernelBuildInfo *kernel_info, const size_t &input_index);
   bool CheckIOParam(const std::string &kernel_name, const KernelBuildInfo *kernel_info,
                     std::vector<std::pair<KernelAttr, GpuKernelCreater>> *iter_second, size_t attr_index);
   // map to maintain kernel and creater, KernelAttr object and creater must be registered as a pair.
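The CheckSM extraction above keeps the two-level compute-capability policy (hard minimum vs. recommended level for half precision) in one place. A standalone sketch of that policy, with made-up threshold values since MINIUM_SM and RECOMMEND_SM are defined elsewhere in the tree and not shown in this diff:

#include <iostream>
#include <stdexcept>
#include <string>

// Hypothetical stand-ins for MINIUM_SM / RECOMMEND_SM.
constexpr int kMinimumSm = 5;
constexpr int kRecommendSm = 6;

// Mirrors the shape of the extracted check: throw below the hard minimum,
// warn below the recommended level, and only warn once.
void CheckSmForHalfPrecision(int major_sm, bool *check_sm) {
  if (!*check_sm || major_sm >= kRecommendSm) {
    return;
  }
  if (major_sm < kMinimumSm) {
    throw std::runtime_error("half precision needs compute capability >= " + std::to_string(kMinimumSm));
  }
  std::cerr << "recommended compute capability is >= " << kRecommendSm
            << ", current device reports " << major_sm << '\n';
  *check_sm = false;  // warn only once, as set_check_sm(false) does above
}

int main() {
  bool check_sm = true;
  CheckSmForHalfPrecision(5, &check_sm);  // prints the warning
  CheckSmForHalfPrecision(5, &check_sm);  // silent afterwards
  return 0;
}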
@@ -44,61 +44,9 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
       return true;
     }
     if (!use_split_matrix) {
-      auto input1_addr = GetDeviceAddress<T>(inputs, 0);
-      auto output_addr = GetDeviceAddress<T>(outputs, 0);
-      auto d_array_addr = GetDeviceAddress<T *>(workspace, 0);
-      auto d_identity_addr = GetDeviceAddress<T *>(workspace, 1);
-      auto d_info_array_addr = GetDeviceAddress<int>(workspace, 2);
-      for (size_t i = 0; i < batch_; i++) {
-        h_array[i] = input1_addr + i * lda_ * m_;
-        h_identity[i] = output_addr + i * ldb_ * m_;
-        CHECK_CUDA_RET_WITH_ERROR(
-          cudaMemcpyAsync(output_addr + i * ldb_ * m_, h_identity_data.data(), sizeof(T) * ldb_ * m_,
-                          cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
-          "cuda memcopy Fail");
-      }
-      CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
-                                                cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
-                                "cuda memcopy Fail");
-      CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
-                                                cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
-                                "cuda memcopy Fail");
-      CHECK_CUSOLVER_RET_WITH_EXCEPT(
-        cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
-        "cusolver cholesky batched Fail");
-      float alpha = 1;
-      CHECK_CUBLAS_RET_WITH_EXCEPT(
-        cublasStrsmBatched(blas_handle_, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, m_, m_, &alpha,
-                           d_array_addr, lda_, d_identity_addr, ldb_, batch_),
-        "cublas trsm batched Fail");
+      LaunchNonSplitMatrix(inputs, workspace, outputs, stream_ptr);
     } else {
-      auto input1_addr = GetDeviceAddress<T>(inputs, 0);
-      auto output_addr = GetDeviceAddress<T>(outputs, 0);
-      auto d_array_addr = GetDeviceAddress<T *>(workspace, 0);
-      auto d_identity_addr = GetDeviceAddress<T *>(workspace, 1);
-      auto d_info_array_addr = GetDeviceAddress<int>(workspace, 2);
-      auto d_batch_input_addr = GetDeviceAddress<T>(workspace, 3);
-      for (size_t i = 0; i < batch_; i++) {
-        h_array[i] = d_batch_input_addr + i * lda_ * m_;
-        h_identity[i] = output_addr + i * ldb_ * m_;
-      }
-      Identity(batch_ * split_dim * split_dim, split_dim, output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
-      MatrixSplit(batch_ * split_dim * split_dim, split_dim, width, input1_addr, d_batch_input_addr,
-                  reinterpret_cast<cudaStream_t>(stream_ptr));
-      CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
-                                                cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
-                                "cuda memcopy Fail");
-      CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
-                                                cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
-                                "cuda memcopy Fail");
-      CHECK_CUSOLVER_RET_WITH_EXCEPT(
-        cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
-        "cusolver cholesky batched Fail");
-      float alpha = 1;
-      CHECK_CUBLAS_RET_WITH_EXCEPT(
-        cublasStrsmBatched(blas_handle_, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, m_, m_, &alpha,
-                           d_array_addr, lda_, d_identity_addr, ldb_, batch_),
-        "cublas trsm batched Fail");
+      LaunchSplitMatrix(inputs, workspace, outputs, stream_ptr);
     }
     return true;
   }
@@ -108,92 +56,17 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
     auto in_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
     split_dim = static_cast<int>(GetAttr<int64_t>(kernel_node, "split_dim"));
     if (split_dim == 0) {
-      use_split_matrix = false;
-      if (in_shape.size() == 2) {
-        batch_ = 1;
-        if (in_shape[0] != in_shape[1]) {
-          MS_LOG(ERROR) << "CholeskyTrsm need square matrix as input.";
-        }
-      } else if (in_shape.size() == 3) {
-        batch_ = SizeToInt(in_shape[0]);
-        if (in_shape[1] != in_shape[2]) {
-          MS_LOG(ERROR) << "CholeskyTrsm need square matrix as input.";
-        }
-      } else {
-        MS_LOG(ERROR) << "Input Only support Rank 2 OR 3";
-      }
-      m_ = SizeToInt(in_shape[1]);
-      lda_ = m_;
-      ldb_ = m_;
-      h_array.resize(batch_);
-      h_identity.resize(batch_);
-      h_identity_data.resize(m_ * m_);
-      for (size_t i = 0; i < m_; i++) {
-        for (size_t j = 0; j < m_; j++) {
-          if (i == j) {
-            h_identity_data[i * m_ + j] = 1;
-          } else {
-            h_identity_data[i * m_ + j] = 0;
-          }
-        }
-      }
-      InitSizeLists();
+      InitDim0(kernel_node, in_shape);
     } else {
       if (in_shape.size() != 2) {
         MS_LOG(ERROR) << "CholeskyTrsm Split Matrix Need Input Rank as 2.";
       }
-      height = in_shape[0];
-      width = in_shape[1];
-      if (height != width) {
+      if (in_shape[0] != in_shape[1]) {
         MS_LOG(ERROR) << "CholeskyTrsm Split Matrix Need Square Matrix as Input.";
       }
-      if (SizeToInt(height) <= split_dim) {
-        use_split_matrix = false;
-        batch_ = 1;
-        m_ = SizeToInt(in_shape[1]);
-        lda_ = m_;
-        ldb_ = m_;
-        h_array.resize(batch_);
-        h_identity.resize(batch_);
-        h_identity_data.resize(m_ * m_);
-        for (size_t i = 0; i < m_; i++) {
-          for (size_t j = 0; j < m_; j++) {
-            if (i == j) {
-              h_identity_data[i * m_ + j] = 1;
-            } else {
-              h_identity_data[i * m_ + j] = 0;
-            }
-          }
-        }
-        InitSizeLists();
-      } else {
-        use_split_matrix = true;
-        int batch = SizeToInt(in_shape[1]) / split_dim;
-        res_dim = in_shape[1] - batch * split_dim;
-        if (res_dim == 0) {
-          batch_ = batch;
-        } else {
-          batch_ = batch + 1;
-        }
-        m_ = split_dim;
-        lda_ = m_;
-        ldb_ = m_;
-        h_array.resize(batch_);
-        h_identity.resize(batch_);
-        h_identity_data.resize(m_ * m_);
-        for (size_t i = 0; i < m_; i++) {
-          for (size_t j = 0; j < m_; j++) {
-            if (i == j) {
-              h_identity_data[i * m_ + j] = 1;
-            } else {
-              h_identity_data[i * m_ + j] = 0;
-            }
-          }
-        }
-        InitSizeLists();
-      }
+      InitDimOthers(kernel_node, in_shape);
     }
+    InitSizeLists();
     return true;
   }
@@ -229,6 +102,145 @@ class CholeskyTrsmGpuKernel : public GpuKernel {
   }
  private:
+  void LaunchNonSplitMatrix(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                            const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+    auto input1_addr = GetDeviceAddress<T>(inputs, 0);
+    auto output_addr = GetDeviceAddress<T>(outputs, 0);
+    auto d_array_addr = GetDeviceAddress<T *>(workspace, 0);
+    auto d_identity_addr = GetDeviceAddress<T *>(workspace, 1);
+    auto d_info_array_addr = GetDeviceAddress<int>(workspace, 2);
+    for (size_t i = 0; i < batch_; i++) {
+      h_array[i] = input1_addr + i * lda_ * m_;
+      h_identity[i] = output_addr + i * ldb_ * m_;
+      CHECK_CUDA_RET_WITH_ERROR(
+        cudaMemcpyAsync(output_addr + i * ldb_ * m_, h_identity_data.data(), sizeof(T) * ldb_ * m_,
+                        cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
+        "cuda memcopy Fail");
+    }
+    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
+                                              cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
+                              "cuda memcopy Fail");
+    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
+                                              cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
+                              "cuda memcopy Fail");
+    CHECK_CUSOLVER_RET_WITH_EXCEPT(
+      cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
+      "cusolver cholesky batched Fail");
+    float alpha = 1;
+    CHECK_CUBLAS_RET_WITH_EXCEPT(
+      cublasStrsmBatched(blas_handle_, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, m_, m_, &alpha,
+                         d_array_addr, lda_, d_identity_addr, ldb_, batch_),
+      "cublas trsm batched Fail");
+  }
+  void LaunchSplitMatrix(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                         const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+    auto input1_addr = GetDeviceAddress<T>(inputs, 0);
+    auto output_addr = GetDeviceAddress<T>(outputs, 0);
+    auto d_array_addr = GetDeviceAddress<T *>(workspace, 0);
+    auto d_identity_addr = GetDeviceAddress<T *>(workspace, 1);
+    auto d_info_array_addr = GetDeviceAddress<int>(workspace, 2);
+    auto d_batch_input_addr = GetDeviceAddress<T>(workspace, 3);
+    for (size_t i = 0; i < batch_; i++) {
+      h_array[i] = d_batch_input_addr + i * lda_ * m_;
+      h_identity[i] = output_addr + i * ldb_ * m_;
+    }
+    Identity(batch_ * split_dim * split_dim, split_dim, output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
+    MatrixSplit(batch_ * split_dim * split_dim, split_dim, width, input1_addr, d_batch_input_addr,
+                reinterpret_cast<cudaStream_t>(stream_ptr));
+    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_array_addr, h_array.data(), sizeof(T *) * batch_,
+                                              cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
+                              "cuda memcopy Fail");
+    CHECK_CUDA_RET_WITH_ERROR(cudaMemcpyAsync(d_identity_addr, h_identity.data(), sizeof(T *) * batch_,
+                                              cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
+                              "cuda memcopy Fail");
+    CHECK_CUSOLVER_RET_WITH_EXCEPT(
+      cusolverDnSpotrfBatched(handle_, uplo, m_, d_array_addr, lda_, d_info_array_addr, batch_),
+      "cusolver cholesky batched Fail");
+    float alpha = 1;
+    CHECK_CUBLAS_RET_WITH_EXCEPT(
+      cublasStrsmBatched(blas_handle_, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, m_, m_, &alpha,
+                         d_array_addr, lda_, d_identity_addr, ldb_, batch_),
+      "cublas trsm batched Fail");
+  }
+  void InitDim0(const CNodePtr &kernel_node, const std::vector<size_t> &in_shape) {
+    use_split_matrix = false;
+    if (in_shape.size() == 2) {
+      batch_ = 1;
+      if (in_shape[0] != in_shape[1]) {
+        MS_LOG(ERROR) << "CholeskyTrsm need square matrix as input.";
+      }
+    } else if (in_shape.size() == 3) {
+      batch_ = SizeToInt(in_shape[0]);
+      if (in_shape[1] != in_shape[2]) {
+        MS_LOG(ERROR) << "CholeskyTrsm need square matrix as input.";
+      }
+    } else {
+      MS_LOG(ERROR) << "Input Only support Rank 2 OR 3";
+    }
+    m_ = SizeToInt(in_shape[1]);
+    lda_ = m_;
+    ldb_ = m_;
+    h_array.resize(batch_);
+    h_identity.resize(batch_);
+    h_identity_data.resize(m_ * m_);
+    for (size_t i = 0; i < m_; i++) {
+      for (size_t j = 0; j < m_; j++) {
+        if (i == j) {
+          h_identity_data[i * m_ + j] = 1;
+        } else {
+          h_identity_data[i * m_ + j] = 0;
+        }
+      }
+    }
+  }
+  void InitDimOthers(const CNodePtr &kernel_node, const std::vector<size_t> &in_shape) {
+    height = in_shape[0];
+    width = in_shape[1];
+    if (SizeToInt(height) <= split_dim) {
+      use_split_matrix = false;
+      batch_ = 1;
+      m_ = SizeToInt(in_shape[1]);
+      lda_ = m_;
+      ldb_ = m_;
+      h_array.resize(batch_);
+      h_identity.resize(batch_);
+      h_identity_data.resize(m_ * m_);
+      for (size_t i = 0; i < m_; i++) {
+        for (size_t j = 0; j < m_; j++) {
+          if (i == j) {
+            h_identity_data[i * m_ + j] = 1;
+          } else {
+            h_identity_data[i * m_ + j] = 0;
+          }
+        }
+      }
+    } else {
+      use_split_matrix = true;
+      int batch = SizeToInt(in_shape[1]) / split_dim;
+      res_dim = in_shape[1] - batch * split_dim;
+      if (res_dim == 0) {
+        batch_ = batch;
+      } else {
+        batch_ = batch + 1;
+      }
+      m_ = split_dim;
+      lda_ = m_;
+      ldb_ = m_;
+      h_array.resize(batch_);
+      h_identity.resize(batch_);
+      h_identity_data.resize(m_ * m_);
+      for (size_t i = 0; i < m_; i++) {
+        for (size_t j = 0; j < m_; j++) {
+          if (i == j) {
+            h_identity_data[i * m_ + j] = 1;
+          } else {
+            h_identity_data[i * m_ + j] = 0;
+          }
+        }
+      }
+    }
+  }
   size_t batch_;
   size_t m_;
   size_t lda_;
@@ -159,8 +159,8 @@ class UpdateThorGradientGpuKernel : public GpuKernel {
     size_t output_size = gradient_size.ori_h * gradient_size.ori_w * unit_size;
     output_size_list_.push_back(output_size);
-    size_t workspace_size_ = 0;
-    workspace_size_ = gradient_size.w * gradient_size.h * gradient_size.batch_w * gradient_size.batch_h * unit_size;
+    size_t workspace_size_ =
+      gradient_size.w * gradient_size.h * gradient_size.batch_w * gradient_size.batch_h * unit_size;
     workspace_size_list_.push_back(workspace_size_);
     if (gradient_size.need_convert) {
@@ -56,51 +56,24 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
   const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
   const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
   bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
               const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
-    T *input_addr = GetDeviceAddress<T>(inputs, 0);
-    T *output_addr = GetDeviceAddress<T>(outputs, 0);
-    cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
     switch (nccl_kernel_type_) {
       case NCCL_ALL_REDUCE: {
-        auto all_reduce_funcptr =
-          reinterpret_cast<AllReduce>(dlsym(const_cast<void *>(collective_handle_), "AllReduce"));
-        MS_EXCEPTION_IF_NULL(all_reduce_funcptr);
-        CHECK_NCCL_RET_WITH_EXCEPT((*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T),
-                                                         nccl_data_type_, nccl_reduce_type_, stream, group_name_),
-                                   "ncclAllReduce failed");
+        LaunchAllReduce(inputs, outputs, stream_ptr);
         break;
       }
       case NCCL_ALL_GATHER: {
-        auto all_gather_funcptr =
-          reinterpret_cast<AllGather>(dlsym(const_cast<void *>(collective_handle_), "AllGather"));
-        MS_EXCEPTION_IF_NULL(all_gather_funcptr);
-        CHECK_NCCL_RET_WITH_EXCEPT(
-          (*all_gather_funcptr)(input_addr, output_addr, input_size_ / sizeof(T), nccl_data_type_, stream, group_name_),
-          "ncclAllGather failed");
+        LaunchAllGather(inputs, outputs, stream_ptr);
        break;
       }
       case NCCL_REDUCE_SCATTER: {
-        auto reduce_scatter_funcptr =
-          reinterpret_cast<ReduceScatter>(dlsym(const_cast<void *>(collective_handle_), "ReduceScatter"));
-        MS_EXCEPTION_IF_NULL(reduce_scatter_funcptr);
-        CHECK_NCCL_RET_WITH_EXCEPT((*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T),
-                                                             nccl_data_type_, nccl_reduce_type_, stream, group_name_),
-                                   "ncclReduceScatter failed");
+        LaunchReduceScatter(inputs, outputs, stream_ptr);
         break;
       }
       case NCCL_BROADCAST: {
-        auto broadcast_funcptr =
-          reinterpret_cast<Broadcast>(dlsym(const_cast<void *>(collective_handle_), "Broadcast"));
-        MS_EXCEPTION_IF_NULL(broadcast_funcptr);
-        for (int i = 0; i < SizeToInt(input_size_list_.size()); ++i) {
-          input_addr = GetDeviceAddress<T>(inputs, i);
-          output_addr = GetDeviceAddress<T>(outputs, i);
-          CHECK_NCCL_RET_WITH_EXCEPT((*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T),
-                                                          nccl_data_type_, root_, stream, group_name_),
-                                     "ncclBroadcast failed");
-        }
+        LaunchBroadcast(inputs, outputs, stream_ptr);
         break;
       }
       default: {
@@ -153,6 +126,59 @@ class NcclCollectiveGpuKernel : public NcclGpuKernel {
   void InitSizeLists() override { return; }
  private:
+  void LaunchAllReduce(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
+                       void *stream_ptr) {
+    T *input_addr = GetDeviceAddress<T>(inputs, 0);
+    T *output_addr = GetDeviceAddress<T>(outputs, 0);
+    cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
+    auto all_reduce_funcptr = reinterpret_cast<AllReduce>(dlsym(const_cast<void *>(collective_handle_), "AllReduce"));
+    MS_EXCEPTION_IF_NULL(all_reduce_funcptr);
+    CHECK_NCCL_RET_WITH_EXCEPT((*all_reduce_funcptr)(input_addr, output_addr, output_size_ / sizeof(T), nccl_data_type_,
+                                                     nccl_reduce_type_, stream, group_name_),
+                               "ncclAllReduce failed");
+  }
+  void LaunchAllGather(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
+                       void *stream_ptr) {
+    T *input_addr = GetDeviceAddress<T>(inputs, 0);
+    T *output_addr = GetDeviceAddress<T>(outputs, 0);
+    cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
+    auto all_gather_funcptr = reinterpret_cast<AllGather>(dlsym(const_cast<void *>(collective_handle_), "AllGather"));
+    MS_EXCEPTION_IF_NULL(all_gather_funcptr);
+    CHECK_NCCL_RET_WITH_EXCEPT(
+      (*all_gather_funcptr)(input_addr, output_addr, input_size_ / sizeof(T), nccl_data_type_, stream, group_name_),
+      "ncclAllGather failed");
+  }
+  void LaunchReduceScatter(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
+                           void *stream_ptr) {
+    T *input_addr = GetDeviceAddress<T>(inputs, 0);
+    T *output_addr = GetDeviceAddress<T>(outputs, 0);
+    cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
+    auto reduce_scatter_funcptr =
+      reinterpret_cast<ReduceScatter>(dlsym(const_cast<void *>(collective_handle_), "ReduceScatter"));
+    MS_EXCEPTION_IF_NULL(reduce_scatter_funcptr);
+    CHECK_NCCL_RET_WITH_EXCEPT((*reduce_scatter_funcptr)(input_addr, output_addr, output_size_ / sizeof(T),
+                                                         nccl_data_type_, nccl_reduce_type_, stream, group_name_),
+                               "ncclReduceScatter failed");
+  }
+  void LaunchBroadcast(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs,
+                       void *stream_ptr) {
+    T *input_addr = GetDeviceAddress<T>(inputs, 0);
+    T *output_addr = GetDeviceAddress<T>(outputs, 0);
+    cudaStream_t stream = comm_stream_ ? comm_stream_ : reinterpret_cast<cudaStream_t>(stream_ptr);
+    auto broadcast_funcptr = reinterpret_cast<Broadcast>(dlsym(const_cast<void *>(collective_handle_), "Broadcast"));
+    MS_EXCEPTION_IF_NULL(broadcast_funcptr);
+    for (int i = 0; i < SizeToInt(input_size_list_.size()); ++i) {
+      input_addr = GetDeviceAddress<T>(inputs, i);
+      output_addr = GetDeviceAddress<T>(outputs, i);
+      CHECK_NCCL_RET_WITH_EXCEPT((*broadcast_funcptr)(input_addr, output_addr, output_size_list_[i] / sizeof(T),
+                                                      nccl_data_type_, root_, stream, group_name_),
+                                 "ncclBroadcast failed");
+    }
+  }
   void InferCommType(const CNodePtr &kernel_node) {
     std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
     auto iter = kNcclTypeMap.find(kernel_name);
@@ -44,25 +44,76 @@ class CtcLossGpuKernel : public GpuKernel {
   bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
               const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
+    LaunchInit(inputs, workspace, outputs);
+    LaunchFirstHalf(inputs, workspace, outputs, stream_ptr);
+    LaunchSecondHalf(inputs, workspace, outputs, stream_ptr);
+    return true;
+  }
+  bool Init(const CNodePtr &kernel_node) override {
+    InitResource();
+    auto probs_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+    if (probs_shape.size() != 3) {
+      MS_LOG(EXCEPTION) << "probs dims: " << probs_shape.size() << " not support.";
+    }
+    probs_dims_[0] = probs_shape[0];
+    probs_dims_[1] = probs_shape[1];
+    probs_dims_[2] = probs_shape[2];
+    auto indice_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
+    auto labels_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
+    if (labels_dims.size() != 1) {
+      MS_LOG(EXCEPTION) << "labels dims: " << labels_dims.size() << " not support.";
+    }
+    if (indice_dims.size() != 2) {
+      MS_LOG(EXCEPTION) << "labels indice dims: " << indice_dims.size() << " not support.";
+    }
+    label_size_ = sizeof(int);
+    for (auto i : labels_dims) {
+      label_size_ *= i;
+    }
+    label_indice_size_ = sizeof(int64_t);
+    for (auto i : indice_dims) {
+      label_indice_size_ *= i;
+    }
+    auto squence_length_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
+    squence_lengths_size_ = squence_length_dims[0] * sizeof(int);
+    preprocess_collapse_repeated_ = GetAttr<bool>(kernel_node, "preprocess_collapse_repeated");
+    ctc_merge_repeated_ = GetAttr<bool>(kernel_node, "ctc_merge_repeated");
+    ignore_longer_outputs_than_inputs_ = GetAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs");
+    InitSizeLists();
+    return true;
+  }
+ protected:
+  void LaunchInit(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                  const std::vector<AddressPtr> &outputs) {
+    probs = GetDeviceAddress<T>(inputs, 0);
+    label_indices = GetDeviceAddress<int64_t>(inputs, 1);
+    label_values = GetDeviceAddress<int>(inputs, 2);
+    sequence_length = GetDeviceAddress<int>(inputs, 3);
+    costs = GetDeviceAddress<T>(outputs, 0);
+    grads = GetDeviceAddress<T>(outputs, 1);
+    softmax_probs = GetDeviceAddress<T>(workspace, 0);
+    cum_labels_length = GetDeviceAddress<int>(workspace, 1);
+    label_squence_length = GetDeviceAddress<int>(workspace, 2);
+    label_value_sp = GetDeviceAddress<int>(workspace, 3);
+    label_value_pcr = GetDeviceAddress<int>(workspace, 4);
+    prob_num = GetDeviceAddress<T>(workspace, 5);
+    precum_labels_length = GetDeviceAddress<int>(workspace, 6);
+    max_labels_length = GetDeviceAddress<int>(workspace, 7);
+    numclass = SizeToInt(probs_dims_[2]);
+    batch = SizeToInt(probs_dims_[1]);
+    max_time = SizeToInt(probs_dims_[0]);
+    max_sequence = 0;
+    max_labels_length_host = 0;
+    batch_label = 0;
+    label_value_with_blank = nullptr;
+    log_alpha_b = nullptr;
+    log_beta_b = nullptr;
+  }
+  void LaunchFirstHalf(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                       const std::vector<AddressPtr> &outputs, void *stream_ptr) {
     cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
-    const T *probs = GetDeviceAddress<T>(inputs, 0);
-    const int64_t *label_indices = GetDeviceAddress<int64_t>(inputs, 1);
-    const int *label_values = GetDeviceAddress<int>(inputs, 2);
-    const int *sequence_length = GetDeviceAddress<int>(inputs, 3);
-    T *costs = GetDeviceAddress<T>(outputs, 0);
-    T *grads = GetDeviceAddress<T>(outputs, 1);
-    T *softmax_probs = GetDeviceAddress<T>(workspace, 0);
-    int *cum_labels_length = GetDeviceAddress<int>(workspace, 1);
-    int *label_squence_length = GetDeviceAddress<int>(workspace, 2);
-    int *label_value_sp = GetDeviceAddress<int>(workspace, 3);
-    int *label_value_pcr = GetDeviceAddress<int>(workspace, 4);
-    T *prob_num = GetDeviceAddress<T>(workspace, 5);
-    int *precum_labels_length = GetDeviceAddress<int>(workspace, 6);
-    int *max_labels_length = GetDeviceAddress<int>(workspace, 7);
-    int numclass = SizeToInt(probs_dims_[2]);
-    int batch = SizeToInt(probs_dims_[1]);
-    int max_time = SizeToInt(probs_dims_[0]);
-    int max_sequence = 0;
     CalculateMaxSequence(sequence_length, max_labels_length, batch, stream);
     CHECK_CUDA_RET_WITH_EXCEPT(
       cudaMemcpyAsync(&max_sequence, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream),
@@ -73,11 +124,7 @@ class CtcLossGpuKernel : public GpuKernel {
     }
     InnerSoftMax(probs, softmax_probs, sequence_length, max_time, batch, numclass, stream);
     MemsetForWS(label_value_pcr, cum_labels_length, label_squence_length, costs, grads, stream);
-    int max_labels_length_host = 0;
-    int batch_label = 0;
-    int *label_value_with_blank = nullptr;
-    T *log_alpha_b = nullptr;
-    T *log_beta_b = nullptr;
     CalculatePreLength(label_squence_length, precum_labels_length, cum_labels_length, max_labels_length, label_indices,
                        batch, label_size_ / sizeof(int), stream);
     CHECK_CUDA_RET_WITH_EXCEPT(
@@ -97,8 +144,14 @@ class CtcLossGpuKernel : public GpuKernel {
       cudaMemcpyAsync(&max_labels_length_host, max_labels_length, sizeof(int), cudaMemcpyDeviceToHost, stream),
       "cudaMemcpyAsync failed.");
     CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
+  }
+  void LaunchSecondHalf(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                        const std::vector<AddressPtr> &outputs, void *stream_ptr) {
+    cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
     int SOffSet = 2 * max_labels_length_host + 1;
     int log_prob_size = batch * SOffSet * max_time;
     if (!ignore_longer_outputs_than_inputs_ && max_labels_length_host > max_time) {
       MS_LOG(EXCEPTION) << "output size is greater than input size.";
     }
@@ -124,43 +177,8 @@ class CtcLossGpuKernel : public GpuKernel {
                   ignore_longer_outputs_than_inputs_, stream);
     CHECK_CUDA_RET_WITH_EXCEPT(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed.");
     FreeMem(label_value_with_blank, log_alpha_b, log_beta_b);
-    return true;
-  }
-  bool Init(const CNodePtr &kernel_node) override {
-    InitResource();
-    auto probs_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
-    if (probs_shape.size() != 3) {
-      MS_LOG(EXCEPTION) << "probs dims: " << probs_shape.size() << " not support.";
-    }
-    probs_dims_[0] = probs_shape[0];
-    probs_dims_[1] = probs_shape[1];
-    probs_dims_[2] = probs_shape[2];
-    auto indice_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
-    auto labels_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
-    if (labels_dims.size() != 1) {
-      MS_LOG(EXCEPTION) << "labels dims: " << labels_dims.size() << " not support.";
-    }
-    if (indice_dims.size() != 2) {
-      MS_LOG(EXCEPTION) << "labels indice dims: " << indice_dims.size() << " not support.";
-    }
-    label_size_ = sizeof(int);
-    for (auto i : labels_dims) {
-      label_size_ *= i;
-    }
-    label_indice_size_ = sizeof(int64_t);
-    for (auto i : indice_dims) {
-      label_indice_size_ *= i;
-    }
-    auto squence_length_dims = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
-    squence_lengths_size_ = squence_length_dims[0] * sizeof(int);
-    preprocess_collapse_repeated_ = GetAttr<bool>(kernel_node, "preprocess_collapse_repeated");
-    ctc_merge_repeated_ = GetAttr<bool>(kernel_node, "ctc_merge_repeated");
-    ignore_longer_outputs_than_inputs_ = GetAttr<bool>(kernel_node, "ignore_longer_outputs_than_inputs");
-    InitSizeLists();
-    return true;
   }
- protected:
   void InitSizeLists() override {
     input_size_list_.push_back(probs_dims_[0] * probs_dims_[1] * probs_dims_[2] * sizeof(T));
     input_size_list_.push_back(label_indice_size_);
@@ -226,6 +244,31 @@ class CtcLossGpuKernel : public GpuKernel {
   bool ctc_merge_repeated_;
   bool ignore_longer_outputs_than_inputs_;
   T kLogZero_ = -std::numeric_limits<T>::infinity();
+  // Heap parameter
+  T *probs;
+  int64_t *label_indices;
+  int *label_values;
+  int *sequence_length;
+  T *costs;
+  T *grads;
+  T *softmax_probs;
+  int *cum_labels_length;
+  int *label_squence_length;
+  int *label_value_sp;
+  int *label_value_pcr;
+  T *prob_num;
+  int *precum_labels_length;
+  int *max_labels_length;
+  int numclass;
+  int batch;
+  int max_time;
+  int max_sequence;
+  int max_labels_length_host;
+  int batch_label;
+  int *label_value_with_blank;
+  T *log_alpha_b;
+  T *log_beta_b;
 };  // namespace kernel
 }  // namespace kernel
 }  // namespace mindspore
@@ -82,17 +82,11 @@ class L2NormalizeGpuKernel : public GpuKernel {
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
     InitResource();
     data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
-    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-    if (input_num != 1) {
-      MS_LOG(ERROR) << "Input number is " << input_num << ", but l2normalize op needs 1 inputs.";
-      return false;
-    }
-    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
-    if (output_num != 1) {
-      MS_LOG(ERROR) << "Output number is " << output_num << ", but l2normalize op needs 1 output.";
+    if (!CheckIONumber(kernel_node)) {
       return false;
     }
     int input_dim_length = SizeToInt(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0).size());
@@ -176,6 +170,19 @@ class L2NormalizeGpuKernel : public GpuKernel {
   }
  private:
+  bool CheckIONumber(const CNodePtr &kernel_node) {
+    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+    if (input_num != 1) {
+      MS_LOG(ERROR) << "Input number is " << input_num << ", but l2normalize op needs 1 inputs.";
+      return false;
+    }
+    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+    if (output_num != 1) {
+      MS_LOG(ERROR) << "Output number is " << output_num << ", but l2normalize op needs 1 output.";
+      return false;
+    }
+    return true;
+  }
   void DestroyResource() noexcept {
     CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyReduceTensorDescriptor(reduce_tensor_descriptor_),
                                "cudnnDestroyReduceTensorDescriptor failed.");
@@ -104,20 +104,31 @@ class L2NormalizeGradGpuKernel : public GpuKernel {
     return true;
   }
+  bool CheckInputShape(const std::vector<size_t> &output_shape) {
+    for (auto &shape : input_shape_list_) {
+      if (output_shape != shape) {
+        MS_LOG(EXCEPTION) << "Input shape and output shape should be same!";
+      }
+    }
+    is_null_input_ = CHECK_NULL_INPUT(input_shape_list_[0]);
+    if (is_null_input_) {
+      MS_LOG(WARNING) << "L2NormalizeGPUKernel input is null";
+      InitSizeLists();
+      return false;
+    }
+    if (input_shape_list_[0].size() > MAX_DIMS) {
+      MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7";
+    }
+    return true;
+  }
   bool Init(const CNodePtr &kernel_node) override {
     InitResource();
     data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
-    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-    if (input_num != INPUT_SIZE) {
-      MS_LOG(ERROR) << "Input number is " << input_num << ", but l2normalize op needs 3 inputs.";
+    if (!CheckIONumber(kernel_node)) {
       return false;
     }
-    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
-    if (output_num != 1) {
-      MS_LOG(ERROR) << "Output number is " << output_num << ", but l2normalize op needs 1 output.";
-      return false;
-    }
     int input_dim_length = SizeToInt(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0).size());
     int axis = static_cast<int>(GetAttr<int64_t>(kernel_node, "axis"));
     axis_ = axis < 0 ? (axis + input_dim_length) : axis;
@@ -128,10 +139,8 @@ class L2NormalizeGradGpuKernel : public GpuKernel {
       input_shape_list_.emplace_back(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i));
     }
     auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
-    for (auto &shape : input_shape_list_) {
-      if (output_shape != shape) {
-        MS_LOG(EXCEPTION) << "Input shape and output shape should be same!";
-      }
+    if (!CheckInputShape(output_shape)) {
+      return true;
     }
     output_size_ = sizeof(T);
@@ -139,16 +148,6 @@ class L2NormalizeGradGpuKernel : public GpuKernel {
       output_size_ *= dim;
     }
-    is_null_input_ = CHECK_NULL_INPUT(input_shape_list_[0]);
-    if (is_null_input_) {
-      MS_LOG(WARNING) << "L2NormalizeGPUKernel input is null";
-      InitSizeLists();
-      return true;
-    }
-    if (input_shape_list_[0].size() > MAX_DIMS) {
-      MS_LOG(EXCEPTION) << "Broadcast operation not support dim greater than 7";
-    }
     std::vector<size_t> output_reduce_shape = output_shape;
     output_reduce_shape[axis_] = 1;
@@ -173,6 +172,19 @@ class L2NormalizeGradGpuKernel : public GpuKernel {
   }
  protected:
+  bool CheckIONumber(const CNodePtr &kernel_node) {
+    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+    if (input_num != INPUT_SIZE) {
+      MS_LOG(ERROR) << "Input number is " << input_num << ", but l2normalize op needs 3 inputs.";
+      return false;
+    }
+    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+    if (output_num != 1) {
+      MS_LOG(ERROR) << "Output number is " << output_num << ", but l2normalize op needs 1 output.";
+      return false;
+    }
+    return true;
+  }
   void InitResource() override {
     cudnn_handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCudnnHandle();
     CHECK_CUDNN_RET_WITH_EXCEPT(cudnnCreateReduceTensorDescriptor(&reduce_tensor_descriptor_),
@@ -53,14 +53,7 @@ class PadGpuFwdKernel : public GpuKernel {
   }
   bool Init(const CNodePtr &kernel_node) override {
-    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-    if (input_num != 1) {
-      MS_LOG(ERROR) << "Input number is " << input_num << ", but Pad needs 1 input.";
-      return false;
-    }
-    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
-    if (output_num != 1) {
-      MS_LOG(ERROR) << "Output number is " << output_num << ", but Pad needs 1 output.";
+    if (!CheckIONumber(kernel_node)) {
       return false;
     }
     auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
@@ -114,6 +107,20 @@ class PadGpuFwdKernel : public GpuKernel {
   }
  private:
+  bool CheckIONumber(const CNodePtr &kernel_node) {
+    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+    if (input_num != 1) {
+      MS_LOG(ERROR) << "Input number is " << input_num << ", but Pad needs 1 input.";
+      return false;
+    }
+    size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
+    if (output_num != 1) {
+      MS_LOG(ERROR) << "Output number is " << output_num << ", but Pad needs 1 output.";
+      return false;
+    }
+    return true;
+  }
   size_t shape_size_;
   size_t temp;
   std::vector<std::vector<int>> paddings;  // list of paddings (tuple of tuple in python)
@@ -76,11 +76,9 @@ class PoolingGradGpuKernel : public GpuKernel {
                                "cudnnPoolingBackward failed");
     return true;
   }
-  bool Init(const CNodePtr &kernel_node) override {
-    InitResource();
-    if (!CheckParam(kernel_node)) {
-      return false;
-    }
+  bool InitShape(const CNodePtr &kernel_node, int *dimA, int *strideAin, int *dimAy, int *strideAiny, int *dimAdy,
+                 int *strideAdy, int *dimAout, int *strideAout) {
     auto input_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
     auto input_mask = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
     auto dout_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 2);
@@ -95,9 +93,25 @@ class PoolingGradGpuKernel : public GpuKernel {
     if (is_null_input_) {
       MS_LOG(WARNING) << "PoolingGradGpuKernel input is null.";
       InitSizeLists();
-      return true;
+      return false;
     }
     SetNCHW(input_shape, &n_, &c_, &old_height_, &old_width_, data_format);
+    SetDimA(input_shape, dimA, 4, data_format);
+    SetStrideA(input_shape, strideAin, 4, data_format);
+    SetDimA(input_mask, dimAy, 4, data_format);
+    SetStrideA(input_mask, strideAiny, 4, data_format);
+    SetDimA(dout_shape, dimAdy, 4, data_format);
+    SetStrideA(dout_shape, strideAdy, 4, data_format);
+    SetDimA(output_shape, dimAout, 4, data_format);
+    SetStrideA(output_shape, strideAout, 4, data_format);
+    return true;
+  }
+  bool Init(const CNodePtr &kernel_node) override {
+    InitResource();
+    if (!CheckParam(kernel_node)) {
+      return false;
+    }
     const int nbDims = 4;
     int dimA[4];
     int strideAin[4];
@@ -107,14 +121,9 @@ class PoolingGradGpuKernel : public GpuKernel {
     int strideAdy[4];
     int dimAout[4];
     int strideAout[4];
-    SetDimA(input_shape, dimA, 4, data_format);
-    SetStrideA(input_shape, strideAin, 4, data_format);
-    SetDimA(input_mask, dimAy, 4, data_format);
-    SetStrideA(input_mask, strideAiny, 4, data_format);
-    SetDimA(dout_shape, dimAdy, 4, data_format);
-    SetStrideA(dout_shape, strideAdy, 4, data_format);
-    SetDimA(output_shape, dimAout, 4, data_format);
-    SetStrideA(output_shape, strideAout, 4, data_format);
+    if (!InitShape(kernel_node, dimA, strideAin, dimAy, strideAiny, dimAdy, strideAdy, dimAout, strideAout)) {
+      return true;
+    }
     CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(y_descriptor_, cudnn_data_type_, nbDims, dimAy, strideAiny),
                                 "cudnnSetTensor4dDescriptor failed");
     CHECK_CUDNN_RET_WITH_EXCEPT(cudnnSetTensorNdDescriptor(dy_descriptor_, cudnn_data_type_, nbDims, dimAdy, strideAdy),
@@ -129,6 +138,7 @@ class PoolingGradGpuKernel : public GpuKernel {
     InitSizeLists();
     return true;
   }
   void DestroyResource() noexcept override {
     CHECK_CUDNN_RET_WITH_ERROR(cudnnDestroyPoolingDescriptor(pooling_descriptor_),
                                "cudnnDestroyPoolingDescriptor failed");