@@ -50,8 +50,8 @@ class ArrayReduceGpuKernel : public GpuKernel {
     T *output_addr = GetDeviceAddress<T>(outputs, 0);
     T *workspace_addr = GetDeviceAddress<T>(workspace, 0);
 
-    const float alpha = 1;
-    const float beta = 0;
+    T alpha = static_cast<T>(1.0f);
+    T beta = static_cast<T>(0.0f);
     if (all_match_) {
       MS_LOG(DEBUG)
         << "The corresponding dimensions of the input and output tensors all match. No need to call cuDNN kernel.";
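Reviewer note, not part of the patch: cuDNN reads the alpha/beta blend factors through host pointers, and per the cuDNN documentation their storage type follows the tensor data type — double for double tensors, float for float and half tensors. Declaring them as `T` lets the double branch in the next hunk pass `&alpha`/`&beta` straight through, while the other types fall back to float copies. A minimal sketch of that rule as a type trait (the name `CudnnScalingType` is hypothetical, not from the patch):

template <typename T>
struct CudnnScalingType {
  using type = float;  // float and half tensors scale with float alpha/beta
};
template <>
struct CudnnScalingType<double> {
  using type = double;  // double tensors must scale with double alpha/beta
};

// Usage sketch: CudnnScalingType<double>::type alpha = 1.0, beta = 0.0;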
@@ -60,11 +60,21 @@ class ArrayReduceGpuKernel : public GpuKernel {
                                                  reinterpret_cast<cudaStream_t>(stream_ptr)),
                                  "cudaMemcpyAsync failed in ArrayReduceGpuKernel::Launch.");
     } else {
-      CHECK_CUDNN_RET_WITH_EXCEPT(
-        kernel_node_,
-        cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_, &alpha,
-                          inputA_descriptor_, input_addr, &beta, outputC_descriptor_, output_addr),
-        "cudnnReduceTensor failed.");
+      if (data_type_ == CUDNN_DATA_DOUBLE) {
+        CHECK_CUDNN_RET_WITH_EXCEPT(
+          kernel_node_,
+          cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_,
+                            &alpha, inputA_descriptor_, input_addr, &beta, outputC_descriptor_, output_addr),
+          "cudnnReduceTensor failed.");
+      } else {
+        const float alphaf = static_cast<float>(alpha);
+        const float betaf = static_cast<float>(beta);
+        CHECK_CUDNN_RET_WITH_EXCEPT(
+          kernel_node_,
+          cudnnReduceTensor(cudnn_handle_, reduce_tensor_descriptor_, nullptr, 0, workspace_addr, workspace_size_,
+                            &alphaf, inputA_descriptor_, input_addr, &betaf, outputC_descriptor_, output_addr),
+          "cudnnReduceTensor failed.");
+      }
     }
     return true;
   }
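Again as reviewer context rather than patch content: the Launch branch above is easier to follow next to the bare cuDNN call sequence it wraps. The sketch below sums a float NCHW tensor over H and W with CUDNN_REDUCE_TENSOR_ADD; every name in it is local to the sketch, and error handling is collapsed to a single status check.

// Standalone sketch (not MindSpore code) of the cuDNN sequence the kernel wraps.
#include <cudnn.h>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>

int main() {
  const int n = 2, c = 3, h = 4, w = 5;

  cudnnHandle_t handle;
  cudnnCreate(&handle);

  // Input (aDesc) and output (cDesc) descriptors; reduced axes get extent 1.
  cudnnTensorDescriptor_t a_desc, c_desc;
  cudnnCreateTensorDescriptor(&a_desc);
  cudnnCreateTensorDescriptor(&c_desc);
  cudnnSetTensor4dDescriptor(a_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w);
  cudnnSetTensor4dDescriptor(c_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, 1, 1);

  // Reduction descriptor: op, compute type, NaN policy, no argmin/argmax indices.
  cudnnReduceTensorDescriptor_t reduce_desc;
  cudnnCreateReduceTensorDescriptor(&reduce_desc);
  cudnnSetReduceTensorDescriptor(reduce_desc, CUDNN_REDUCE_TENSOR_ADD, CUDNN_DATA_FLOAT,
                                 CUDNN_NOT_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES,
                                 CUDNN_32BIT_INDICES);

  // Workspace size depends on the (reduce_desc, a_desc, c_desc) combination.
  size_t workspace_bytes = 0;
  cudnnGetReductionWorkspaceSize(handle, reduce_desc, a_desc, c_desc, &workspace_bytes);

  float *d_in = nullptr, *d_out = nullptr;
  void *d_workspace = nullptr;
  cudaMalloc(reinterpret_cast<void **>(&d_in), sizeof(float) * n * c * h * w);
  cudaMalloc(reinterpret_cast<void **>(&d_out), sizeof(float) * n * c);
  cudaMalloc(&d_workspace, workspace_bytes);

  std::vector<float> host_in(n * c * h * w, 1.0f);  // all ones -> each output element is h * w
  cudaMemcpy(d_in, host_in.data(), sizeof(float) * host_in.size(), cudaMemcpyHostToDevice);

  // Float tensors, so the scaling factors are floats (double tensors would need doubles).
  const float alpha = 1.0f, beta = 0.0f;
  cudnnStatus_t status =
    cudnnReduceTensor(handle, reduce_desc, nullptr, 0, d_workspace, workspace_bytes,
                      &alpha, a_desc, d_in, &beta, c_desc, d_out);
  if (status != CUDNN_STATUS_SUCCESS) {
    std::cerr << "cudnnReduceTensor failed: " << cudnnGetErrorString(status) << std::endl;
  }

  cudaFree(d_workspace);
  cudaFree(d_out);
  cudaFree(d_in);
  cudnnDestroyReduceTensorDescriptor(reduce_desc);
  cudnnDestroyTensorDescriptor(c_desc);
  cudnnDestroyTensorDescriptor(a_desc);
  cudnnDestroy(handle);
  return 0;
}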
@@ -194,12 +204,12 @@ class ArrayReduceGpuKernel : public GpuKernel {
       MS_LOG(EXCEPTION) << "Array reduce kernel type " << kernel_name << " is not supported.";
     }
     reduce_tensor_op_ = iter->second;
 
-    CHECK_CUDNN_RET_WITH_EXCEPT(
-      kernel_node_,
-      cudnnSetReduceTensorDescriptor(reduce_tensor_descriptor_, reduce_tensor_op_, CUDNN_DATA_FLOAT, nan_prop_,
-                                     reduce_indices_, CUDNN_32BIT_INDICES),
-      "cudnnSetReduceTensorDescriptor failed");
+    // For float64 (double) tensors the reduction must also compute in double.
+    cudnnDataType_t comp_type = (data_type_ == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
+    CHECK_CUDNN_RET_WITH_EXCEPT(kernel_node_,
+                                cudnnSetReduceTensorDescriptor(reduce_tensor_descriptor_, reduce_tensor_op_, comp_type,
+                                                               nan_prop_, reduce_indices_, CUDNN_32BIT_INDICES),
+                                "cudnnSetReduceTensorDescriptor failed");
     return;
   }
 
   void InferInAndOutDesc(const std::vector<size_t> &input_shape, const std::vector<size_t> &output_shape) {
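The comp_type ternary exists because cuDNN pairs double tensors with a double computation type: per the cuDNN documentation, when A and C are double tensors, alpha, beta, and the reduction's compute enum are all taken to be double. Continuing the standalone sketch above for the double case (an assumption-labelled fragment, not MindSpore code; d_double_in/d_double_out are hypothetical double buffers, and the workspace would be re-queried for the new descriptors):

// Double variant of the sketch: tensor descriptors, the reduction compute type,
// and the alpha/beta scaling factors all switch to double together.
cudnnSetTensor4dDescriptor(a_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, n, c, h, w);
cudnnSetTensor4dDescriptor(c_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, n, c, 1, 1);
cudnnSetReduceTensorDescriptor(reduce_desc, CUDNN_REDUCE_TENSOR_ADD, CUDNN_DATA_DOUBLE,
                               CUDNN_NOT_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES,
                               CUDNN_32BIT_INDICES);
cudnnGetReductionWorkspaceSize(handle, reduce_desc, a_desc, c_desc, &workspace_bytes);
const double alpha = 1.0, beta = 0.0;  // double scaling factors for double tensors
cudnnReduceTensor(handle, reduce_desc, nullptr, 0, d_workspace, workspace_bytes,
                  &alpha, a_desc, d_double_in, &beta, c_desc, d_double_out);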