@@ -34,6 +34,7 @@ class AddNGpuFwdKernel : public GpuKernel {
       : cudnn_handle_(nullptr),
         input_descriptor_(nullptr),
         cudnn_data_type_(CUDNN_DATA_FLOAT),
+        is_int64_(false),
         input_size_(0),
         output_size_(0),
         workspace_size_(0),
|
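The new is_int64_ member is default-initialized to false, so kernel instances built for cuDNN-supported types behave exactly as before unless Init flips the flag.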
@@ -57,7 +58,7 @@ class AddNGpuFwdKernel : public GpuKernel {
         break;
       }
     }
-    if (cudnn_data_type_ == CUDNN_DATA_INT32) {
+    if (cudnn_data_type_ == CUDNN_DATA_INT32 || is_int64_) {
       FillDeviceArray(outputs[0]->size / sizeof(T), output_addr, 0.0f, reinterpret_cast<cudaStream_t>(stream_ptr));
       FillDeviceArray(outputs[0]->size / sizeof(T), work_addr, 0.0f, reinterpret_cast<cudaStream_t>(stream_ptr));
     }
@@ -67,7 +68,7 @@ class AddNGpuFwdKernel : public GpuKernel {
     const double dbeta = static_cast<double>(0.0f);
     for (size_t i = 0; i < num_input_; i++) {
       T *input_addr = GetDeviceAddress<T>(inputs, i);
-      if (cudnn_data_type_ == CUDNN_DATA_INT32) {
+      if (cudnn_data_type_ == CUDNN_DATA_INT32 || is_int64_) {
         ElewiseArith(outputs[0]->size / sizeof(T), BROADCAST_TYPE_ADD, input_addr, work_addr, work_addr,
                      reinterpret_cast<cudaStream_t>(stream_ptr));
       } else if (cudnn_data_type_ == CUDNN_DATA_DOUBLE) {
|
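These two hunks route int64 through the same fallback Launch already used for INT32: cudnnAddTensor is never called; the work buffer is zero-filled once and each input tensor is folded in with an elementwise add. FillDeviceArray and ElewiseArith are MindSpore's own device helpers; the kernels below are a minimal stand-in sketch of the arithmetic they perform on this path, not the real implementations.

#include <cstdint>

// Stand-in for FillDeviceArray: set every element of the buffer to a value,
// e.g. FillKernel<<<256, 256, 0, stream>>>(n, work_addr, 0);
__global__ void FillKernel(size_t n, int64_t *out, int64_t value) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (size_t)gridDim.x * blockDim.x) {
    out[i] = value;
  }
}

// Stand-in for ElewiseArith(n, BROADCAST_TYPE_ADD, input_addr, work_addr, work_addr, stream):
// acc[i] = in[i] + acc[i], launched once per input tensor of the AddN.
__global__ void AccumulateKernel(size_t n, const int64_t *in, int64_t *acc) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (size_t)gridDim.x * blockDim.x) {
    acc[i] += in[i];
  }
}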
@@ -92,8 +93,12 @@ class AddNGpuFwdKernel : public GpuKernel {
   }
   bool Init(const CNodePtr &kernel_node) override {
     kernel_node_ = kernel_node;
+    // int64 is not supported by cuDNN, so we take a separate path for it
+    is_int64_ = (AnfAlgo::GetInputDeviceDataType(kernel_node, 0) == kNumberTypeInt64);
     InitResource();
-    cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
+    if (!is_int64_) {
+      cudnn_data_type_ = GetCudnnDataType(TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0)));
+    }
     size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
     num_input_ = GetAttr<int64_t>(kernel_node, "n");
     if (num_input_ != input_num) {
|
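Init detects the int64 case up front, before any cuDNN state is touched. When is_int64_ is set, GetCudnnDataType is skipped and cudnn_data_type_ simply keeps its constructor default of CUDNN_DATA_FLOAT; that stale value is harmless because every later branch in Launch, Init, and InitSizeLists tests is_int64_ rather than the cuDNN type.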
@@ -112,24 +117,31 @@ class AddNGpuFwdKernel : public GpuKernel {
       InitSizeLists();
       return true;
     }
-    for (size_t i = input_shape.size(); i < 4; i++) {
-      (void)input_shape.insert(input_shape.begin(), 1);
-    }
-    std::vector<int> dimA;
-    for (size_t i = 0; i < input_shape.size(); i++) {
-      dimA.push_back(SizeToInt(input_shape[i]));
-    }
-    auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0);
-    if (input_format == kOpFormat_NHWC) {
-      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
-                                  cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_,
-                                                               SizeToInt(input_shape.size()), dimA.data()),
-                                  "cudnnSetTensorNdDescriptor failed");
-    } else {
-      CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
-                                  cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
-                                                               SizeToInt(input_shape.size()), dimA.data()),
-                                  "cudnnSetTensorNdDescriptor failed");
-    }
+    if (is_int64_) {
+      input_size_ = sizeof(T);
+      for (size_t i = 0; i < input_shape.size(); i++) {
+        input_size_ *= input_shape[i];
+      }
+    } else {
+      for (size_t i = input_shape.size(); i < 4; i++) {
+        (void)input_shape.insert(input_shape.begin(), 1);
+      }
+      std::vector<int> dimA;
+      for (size_t i = 0; i < input_shape.size(); i++) {
+        dimA.push_back(SizeToInt(input_shape[i]));
+      }
+      auto input_format = AnfAlgo::GetInputFormat(kernel_node, 0);
+      if (input_format == kOpFormat_NHWC) {
+        CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
+                                    cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NHWC, cudnn_data_type_,
+                                                                 SizeToInt(input_shape.size()), dimA.data()),
+                                    "cudnnSetTensorNdDescriptor failed");
+      } else {
+        CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
+                                    cudnnSetTensorNdDescriptorEx(input_descriptor_, CUDNN_TENSOR_NCHW, cudnn_data_type_,
+                                                                 SizeToInt(input_shape.size()), dimA.data()),
+                                    "cudnnSetTensorNdDescriptor failed");
+      }
+    }
     InitSizeLists();
     return true;
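For int64 the descriptor machinery is bypassed entirely: the old shape-padding and cudnnSetTensorNdDescriptorEx setup moves into the else branch, and Init computes input_size_ by hand as the element size times the product of all dimensions, which matches what cudnnGetTensorSizeInBytes would report for a densely packed tensor. A hypothetical helper restating that computation (TensorBytes is not part of the kernel):

#include <functional>
#include <numeric>
#include <vector>

// Byte size of a densely packed tensor: elem_size * product of all dims,
// i.e. what the is_int64_ branch above accumulates into input_size_.
size_t TensorBytes(const std::vector<size_t> &shape, size_t elem_size) {
  return std::accumulate(shape.begin(), shape.end(), elem_size, std::multiplies<size_t>());
}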
@@ -147,7 +159,7 @@ class AddNGpuFwdKernel : public GpuKernel {
                                 "cudnnCreateTensorDescriptor failed");
   }
   void InitSizeLists() override {
-    if (!is_null_input_) {
+    if (!is_null_input_ && !is_int64_) {
       CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_, cudnnGetTensorSizeInBytes(input_descriptor_, &input_size_),
                                   "cudnnGetTensorSizeInBytes failed");
     }
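The same guard appears in InitSizeLists: the int64 path never configured input_descriptor_, so querying it with cudnnGetTensorSizeInBytes would be invalid, and input_size_ already holds the value Init computed by hand.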
@@ -167,6 +179,7 @@ class AddNGpuFwdKernel : public GpuKernel {
   std::vector<size_t> output_size_list_;
   std::vector<size_t> workspace_size_list_;
 
+  bool is_int64_;
   size_t input_size_;
   size_t output_size_;
   size_t workspace_size_;
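One piece is not visible in this header diff: for the new path to be reachable, the kernel also has to be registered for kNumberTypeInt64 so the framework can select it. Assuming it follows the MS_REG_GPU_KERNEL_ONE pattern used for the existing AddN types, the entry in the .cc counterpart would look roughly like the sketch below; the actual registration is outside this diff.

// Assumed registration, mirroring the existing AddN entries; the real line
// lives in the .cc file and is not part of this change as shown.
MS_REG_GPU_KERNEL_ONE(AddN,
                      KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
                      AddNGpuFwdKernel, int64_t)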