intial commit: cast to float32

add casting to float32 for intopk fix ci fix ci fixed intopk
4 years ago · b55a930215
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/in_top_k_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/in_top_k_gpu_kernel.h
@@ -22,6 +22,7 @@

 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/cast_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/in_top_k_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"

@@ -47,12 +48,29 @@ class InTopKGpuKernel : public GpuKernel {
    T *top_k_output_device = GetDeviceAddress<T>(workspace, 0);
    int32_t *top_k_indices_device = GetDeviceAddress<int32_t>(workspace, 1);

    // topk sorts the input along the last dimension
    FastTopK(outer_size_, inner_size_, predictions_device, static_cast<int32_t>(k_), top_k_output_device,
             top_k_indices_device, top_k_init_, reinterpret_cast<cudaStream_t>(stream_ptr));
    if (std::is_same<T, half>::value) {
      // remove later! urgent fix for bug: topk has incorrect output for float16
      float top_k_init = std::numeric_limits<float>::lowest();

      // cast to float32
      float *casted_float32_input = GetDeviceAddress<float>(workspace, 2);
      float *top_k_output_device_float32 = GetDeviceAddress<float>(workspace, 3);

      Cast(input_size_, predictions_device, casted_float32_input, reinterpret_cast<cudaStream_t>(stream_ptr));

      FastTopK(outer_size_, inner_size_, casted_float32_input, static_cast<int32_t>(k_), top_k_output_device_float32,
               top_k_indices_device, top_k_init, reinterpret_cast<cudaStream_t>(stream_ptr));

      CalInTopK(casted_float32_input, targets_device, output_device, top_k_output_device_float32, input_shape_[0],
                input_shape_[1], k_, reinterpret_cast<cudaStream_t>(stream_ptr));
    } else {
      // topk sorts the input along the last dimension
      FastTopK(outer_size_, inner_size_, predictions_device, static_cast<int32_t>(k_), top_k_output_device,
               top_k_indices_device, top_k_init_, reinterpret_cast<cudaStream_t>(stream_ptr));

    CalInTopK(predictions_device, targets_device, output_device, top_k_output_device, input_shape_[0], k_,
              reinterpret_cast<cudaStream_t>(stream_ptr));
      CalInTopK(predictions_device, targets_device, output_device, top_k_output_device, input_shape_[0],
                input_shape_[1], k_, reinterpret_cast<cudaStream_t>(stream_ptr));
    }

    return true;
  }
@@ -114,6 +132,12 @@ class InTopKGpuKernel : public GpuKernel {
    output_size_list_.push_back(input_shape_[0] * sizeof(bool));
    workspace_size_list_.push_back(input_shape_[0] * k_ * sizeof(T));
    workspace_size_list_.push_back(input_shape_[0] * k_ * sizeof(int32_t));

    // remove later! urgent fix for bug: topk has incorrect output for float16
    if (std::is_same<T, half>::value) {
      workspace_size_list_.push_back(input_size_ * sizeof(float));
      workspace_size_list_.push_back(input_shape_[0] * k_ * sizeof(float));
    }
  }

 private:
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/topk_gpu_kernel.h
@@ -21,6 +21,7 @@
 #include <vector>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
 #include "backend/kernel_compiler/gpu/cuda_impl/cast_impl.cuh"
 #include "backend/kernel_compiler/gpu/cuda_impl/topk_impl.cuh"

 namespace mindspore {
@@ -42,20 +43,38 @@ class TopKGpuKernel : public GpuKernel {
    T *output_addr = GetDeviceAddress<T>(outputs, 0);
    S *indices = GetDeviceAddress<S>(outputs, 1);

    T init_k = std::numeric_limits<T>::lowest();
    if (std::is_same<T, half>::value) {
      // min value representable by float16, std::numeric_limits doesn't support half
      init_k = static_cast<half>(-65504.);
    }

    S k_cut = 0;
    CHECK_CUDA_RET_WITH_EXCEPT(
      kernel_node_,
      cudaMemcpyAsync(&k_cut, k, sizeof(S), cudaMemcpyDeviceToHost, reinterpret_cast<cudaStream_t>(stream_ptr)),
      "cudaMemcpyAsync k_cut failed");
    CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(), "cudaDeviceSyncFailed - TopK");
    FastTopK(outer_size_, inner_size_, input_addr, k_cut, output_addr, indices, init_k,
             reinterpret_cast<cudaStream_t>(stream_ptr));

    if (std::is_same<T, half>::value) {
      // remove later! urgent fix for bug: topk has incorrect output for float16
      float init_k = std::numeric_limits<float>::lowest();

      // cast to float32
      float *casted_float32_input = GetDeviceAddress<float>(workspaces, 0);
      float *casted_float32_top_k_output = GetDeviceAddress<float>(workspaces, 1);
      Cast(outer_size_ * inner_size_, input_addr, casted_float32_input, reinterpret_cast<cudaStream_t>(stream_ptr));

      // call FastTopK with workspace[n], workspace[n+1] as input, output
      FastTopK(outer_size_, inner_size_, casted_float32_input, k_cut, casted_float32_top_k_output, indices, init_k,
               reinterpret_cast<cudaStream_t>(stream_ptr));

      // cast workspace[n+1] back to float16
      Cast(outer_size_ * k_, casted_float32_top_k_output, output_addr, reinterpret_cast<cudaStream_t>(stream_ptr));
    } else {
      T init_k = std::numeric_limits<T>::lowest();
      CHECK_CUDA_RET_WITH_EXCEPT(
        kernel_node_,
        cudaMemcpyAsync(&k_cut, k, sizeof(S), cudaMemcpyDeviceToHost, reinterpret_cast<cudaStream_t>(stream_ptr)),
        "cudaMemcpyAsync k_cut failed");
      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaDeviceSynchronize(), "cudaDeviceSyncFailed - TopK");
      FastTopK(outer_size_, inner_size_, input_addr, k_cut, output_addr, indices, init_k,
               reinterpret_cast<cudaStream_t>(stream_ptr));
    }
    return true;
  }

@@ -82,6 +101,12 @@ class TopKGpuKernel : public GpuKernel {
    input_size_list_.push_back(sizeof(S));
    output_size_list_.push_back(outer_size_ * k_ * sizeof(T));
    output_size_list_.push_back(outer_size_ * k_ * sizeof(S));

    // remove later! urgent fix for bug: topk has incorrect output for float16
    if (std::is_same<T, half>::value) {
      workspace_size_list_.push_back(outer_size_ * inner_size_ * sizeof(float));
      workspace_size_list_.push_back(outer_size_ * k_ * sizeof(float));
    }
  }

 private:
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/in_top_k_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/in_top_k_impl.cu
@@ -21,9 +21,9 @@

 template <typename T>
 __global__ void InTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output,
                       size_t class_id_count, int64_t k) {
                       size_t batch_size, size_t class_id_count, int64_t k) {
  size_t gt_id = blockIdx.x * blockDim.x + threadIdx.x;
  for (; gt_id < class_id_count; gt_id += blockDim.x * gridDim.x) {
  for (; gt_id < batch_size; gt_id += blockDim.x * gridDim.x) {
    int32_t target_index = targets[gt_id];
    T predicted_value = predictions[gt_id * class_id_count + target_index];
    T top_k_smallest_value = top_k_output[k - 1];
@@ -33,14 +33,15 @@ __global__ void InTopK(const T *predictions, const int32_t *targets, bool *outpu
 }

 template <typename T>
 void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t class_id_count,
               int64_t k, cudaStream_t cuda_stream) {
 void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t batch_size,
               size_t class_id_count, int64_t k, cudaStream_t cuda_stream) {
  InTopK<<<GET_BLOCKS(class_id_count), GET_THREADS, 0, cuda_stream>>>(predictions, targets, output, top_k_output,
                                                                      class_id_count, k);
                                                                      batch_size, class_id_count, k);
 }

 template void CalInTopK<half>(const half *predictions, const int32_t *targets, bool *output, const half *top_k_output,
                              size_t class_id_count, int64_t k, cudaStream_t cuda_stream);
                              size_t batch_size, size_t class_id_count, int64_t k, cudaStream_t cuda_stream);

 template void CalInTopK<float>(const float *predictions, const int32_t *targets, bool *output,
                               const float *top_k_output, size_t class_id_count, int64_t k, cudaStream_t cuda_stream);
                               const float *top_k_output, size_t batch_size, size_t class_id_count, int64_t k,
                               cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/in_top_k_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/in_top_k_impl.cuh
@@ -20,7 +20,7 @@
 #include <cuda_runtime.h>

 template <typename T>
 void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t class_id_count,
               int64_t k, cudaStream_t cuda_stream);
 void CalInTopK(const T *predictions, const int32_t *targets, bool *output, const T *top_k_output, size_t batch_size,
               size_t class_id_count, int64_t k, cudaStream_t cuda_stream);

 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_IN_TOP_K_CUH_
--- a/tests/st/ops/gpu/test_in_top_k.py
+++ b/tests/st/ops/gpu/test_in_top_k.py
@@ -32,9 +32,9 @@ class InTopKNet(nn.Cell):
 def in_top_k(nptype):
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

    predictions = Tensor(np.array([[9, 3, 8],
                                   [7, 9, 9],
                                   [9, 9, 9]]).astype(nptype))
    predictions = Tensor(np.array([[9, 3, 8, 0, 0, 0, 0, 0, 0],
                                   [7, 9, 9, 0, 0, 0, 0, 0, 0],
                                   [9, 9, 9, 0, 0, 0, 0, 0, 0]]).astype(nptype))

    k = 1
    in_top_k_net = InTopKNet(k)