!4479 Fix gpu registration macro to work with type unsigned char

Merge pull request !4479 from Peilin/gpu-reg-macro-fix
5 years ago · ca7ec00c71
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.cc
@@ -32,7 +32,7 @@ MS_REG_GPU_KERNEL_ONE(Concat,
                      ConcatV2GpuFwdKernel, short)  // NOLINT
 MS_REG_GPU_KERNEL_ONE(Concat,
                      KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
                      ConcatV2GpuFwdKernel, char)
                      ConcatV2GpuFwdKernel, uchar)
 MS_REG_GPU_KERNEL_ONE(Concat,
                      KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
                      ConcatV2GpuFwdKernel, bool)
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gathernd_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/gathernd_gpu_kernel.cc
@@ -34,7 +34,7 @@ MS_REG_GPU_KERNEL_TWO(
  GatherNdGpuFwdKernel, short, int)  // NOLINT
 MS_REG_GPU_KERNEL_TWO(
  GatherNd, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt8),
  GatherNdGpuFwdKernel, char, int)
  GatherNdGpuFwdKernel, uchar, int)
 MS_REG_GPU_KERNEL_TWO(
  GatherNd, KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool),
  GatherNdGpuFwdKernel, bool, int)
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/strided_slice_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/strided_slice_gpu_kernel.cc
@@ -27,7 +27,7 @@ MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeInt32).
 MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
                      StridedSliceGpuKernel, short)  // NOLINT
 MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
                      StridedSliceGpuKernel, char)
                      StridedSliceGpuKernel, uchar)
 MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
                      StridedSliceGpuKernel, bool)
 }  // namespace kernel
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/strided_slice_grad_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/strided_slice_grad_gpu_kernel.cc
@@ -27,7 +27,7 @@ MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeInt
 MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
                      StridedSliceGradGpuKernel, short)  // NOLINT
 MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
                      StridedSliceGradGpuKernel, char)
                      StridedSliceGradGpuKernel, uchar)
 MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
                      StridedSliceGradGpuKernel, bool)
 }  // namespace kernel
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/check_valid_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/check_valid_impl.cu
@@ -37,7 +37,8 @@ __global__ void CheckValidKernel(const size_t size, const T *box, const T *img_m
 }

 template <typename S>
 __global__ void CheckValidKernel(const size_t size, const char *box, const char *img_metas, S *valid) {
 __global__ void CheckValidKernel(const size_t size, const unsigned char *box,
                                 const unsigned char *img_metas, S *valid) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    const size_t left_x = i * 4;
    const size_t left_y = i * 4 + 1;
@@ -45,10 +46,8 @@ __global__ void CheckValidKernel(const size_t size, const char *box, const char
    const size_t right_y = i * 4 + 3;

    S valid_flag = false;
    valid_flag |= !((unsigned int)box[left_x] >= 0);
    valid_flag |= !((unsigned int)box[left_y] >= 0);
    valid_flag |= !((unsigned int)img_metas[0] * (unsigned int)img_metas[2] - 1 >= (unsigned int)box[right_x]);
    valid_flag |= !((unsigned int)img_metas[1] * (unsigned int)img_metas[2] - 1 >= (unsigned int)box[right_y]);
    valid_flag |= !(img_metas[0] * img_metas[2] >= box[right_x] + 1);
    valid_flag |= !(img_metas[1] * img_metas[2] >= box[right_y] + 1);

    valid[i] = !valid_flag;
  }
@@ -67,5 +66,5 @@ template void CheckValid(const size_t &size, const half *box, const half *img_me
                         cudaStream_t cuda_stream);
 template void CheckValid(const size_t &size, const short *box, const short *img_metas, bool *valid,  // NOLINT
                         cudaStream_t cuda_stream);
 template void CheckValid(const size_t &size, const char *box, const char *img_metas, bool *valid,
 template void CheckValid(const size_t &size, const unsigned char *box, const unsigned char *img_metas, bool *valid,
                         cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cu
@@ -73,7 +73,7 @@ template void ConcatKernel(const size_t size, const int input_num,
                           cudaStream_t cuda_stream);
 template void ConcatKernel(const size_t size, const int input_num,
                           const int all_size_before_axis, const int all_size_axis,
                           int* len_axis, char** inputs, char* output,
                           int* len_axis, unsigned char** inputs, unsigned char* output,
                           cudaStream_t cuda_stream);
 template void ConcatKernel(const size_t size, const int input_num,
                           const int all_size_before_axis, const int all_size_axis,
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/gathernd.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/gathernd.cu
@@ -64,11 +64,12 @@ template void GatherNd<int, int>(int *input, int *indices, int *output, const si
                                 const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
                                 int *batch_strides, cudaStream_t stream);
 template void GatherNd<short, int>(short *input, int *indices, short *output, const size_t &output_dim0,  // NOLINT
                                  const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
                                  int *batch_strides, cudaStream_t stream);
 template void GatherNd<char, int>(char *input, int *indices, char *output, const size_t &output_dim0,
                                  const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
                                  int *batch_strides, cudaStream_t stream);
                                   const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
                                   int *batch_strides, cudaStream_t stream);
 template void GatherNd<unsigned char, int>(unsigned char *input, int *indices, unsigned char *output,
                                           const size_t &output_dim0, const size_t &output_dim1,
                                           const size_t &indices_dim1, int *batch_indices, int *batch_strides,
                                           cudaStream_t stream);
 template void GatherNd<bool, int>(bool *input, int *indices, bool *output, const size_t &output_dim0,
                                  const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices,
                                  int *batch_strides, cudaStream_t stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/slice_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/slice_impl.cu
@@ -180,8 +180,7 @@ template void CalSliceGrad<int>(const size_t input_size, const int *dy, const st
                                const std::vector<int> begin, const std::vector<int> size, int *output,
                                cudaStream_t cuda_stream);

 // NOLINTNEXTLINE
 template void FillDeviceArray<short>(const size_t input_size, short *addr, const float value, cudaStream_t cuda_stream);
 template void FillDeviceArray<short>(const size_t input_size, short *addr, const float value, cudaStream_t cuda_stream);  // NOLINT
 template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2,
                            const int l3, const int l4, const int d1, const int d2, const int d3, const int d4,
                            const short *input, short *output, cudaStream_t stream);  // NOLINT
@@ -189,13 +188,14 @@ template void CalSliceGrad<short>(const size_t input_size, const short *dy, cons
                                const std::vector<int> begin, const std::vector<int> size, short *output,  // NOLINT
                                cudaStream_t cuda_stream);

 template void FillDeviceArray<char>(const size_t input_size, char *addr, const float value, cudaStream_t cuda_stream);
 template void FillDeviceArray<unsigned char>(const size_t input_size, unsigned char *addr, const float value,
                                             cudaStream_t cuda_stream);
 template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2,
                            const int l3, const int l4, const int d1, const int d2, const int d3, const int d4,
                            const char *input, char *output, cudaStream_t stream);
 template void CalSliceGrad<char>(const size_t input_size, const char *dy, const std::vector<int> in_shape,
                                const std::vector<int> begin, const std::vector<int> size, char *output,
                                cudaStream_t cuda_stream);
                            const unsigned char *input, unsigned char *output, cudaStream_t stream);
 template void CalSliceGrad<unsigned char>(const size_t input_size, const unsigned char *dy,
                                          const std::vector<int> in_shape, const std::vector<int> begin,
                                          const std::vector<int> size, unsigned char *output, cudaStream_t cuda_stream);

 template void FillDeviceArray<bool>(const size_t input_size, bool *addr, const float value, cudaStream_t cuda_stream);
 template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2,
@@ -215,12 +215,11 @@ template void StridedSlice(const std::vector<size_t> &input_shape, const std::ve
                           const std::vector<int> &strides, const std::vector<int> &output_shape, const int *input,
                           int *output, cudaStream_t cuda_stream);
 template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin,
                           // NOLINTNEXTLINE
                           const std::vector<int> &strides, const std::vector<int> &output_shape, const short *input,
                           short *output, cudaStream_t cuda_stream);  // NOLINT
                           const std::vector<int> &strides, const std::vector<int> &output_shape,
                           const short *input, short *output, cudaStream_t cuda_stream);  // NOLINT
 template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin,
                           const std::vector<int> &strides, const std::vector<int> &output_shape, const char *input,
                           char *output, cudaStream_t cuda_stream);
                           const std::vector<int> &strides, const std::vector<int> &output_shape,
                           const unsigned char *input, unsigned char *output, cudaStream_t cuda_stream);
 template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin,
                           const std::vector<int> &strides, const std::vector<int> &output_shape, const bool *input,
                           bool *output, cudaStream_t cuda_stream);
@@ -235,12 +234,11 @@ template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vect
                               const std::vector<int> &strides, const std::vector<int> &dx_shape, const int *dy,
                               int *dx, cudaStream_t cuda_stream);
 template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin,
                               // NOLINTNEXTLINE
                               const std::vector<int> &strides, const std::vector<int> &dx_shape, const short *dy,
                               const std::vector<int> &strides, const std::vector<int> &dx_shape, const short *dy,  // NOLINT
                               short *dx, cudaStream_t cuda_stream);  // NOLINT
 template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin,
                               const std::vector<int> &strides, const std::vector<int> &dx_shape, const char *dy,
                               char *dx, cudaStream_t cuda_stream);
                               const std::vector<int> &strides, const std::vector<int> &dx_shape,
                               const unsigned char *dy, unsigned char *dx, cudaStream_t cuda_stream);
 template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin,
                               const std::vector<int> &strides, const std::vector<int> &dx_shape, const bool *dy,
                               bool *dx, cudaStream_t cuda_stream);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h
@@ -64,6 +64,14 @@ class GpuKernelRegister {
  }
 };

 // This is necessary for gpu kernels to support uint8 data type. In cuda, an unsigned,
 // 8 bit integral type is represented by an unsigned char, but the MS_REG_GPU_KERNEL
 // macros defined below will create compilation errors when datatype T contains a space,
 // because the variable created by the macro will also contain a space. So, we solve this
 // problem by writing uchar when calling these macros, and expanding uchar after the
 // variable has been created.
 #define uchar unsigned char

 #define MS_REG_GPU_KERNEL(OPNAME, OPCLASS)                                                 \
  static_assert(std::is_base_of<GpuKernel, OPCLASS>::value, " must be base of GpuKernel"); \
  static const GpuKernelRegister g_##OPNAME##_gpu_kernel_reg(#OPNAME, KernelAttr(), []() { return new OPCLASS(); });
@@ -88,7 +96,6 @@ class GpuKernelRegister {
  static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S>>::value, " must be base of GpuKernel"); \
  static const GpuKernelRegister g_##OPNAME##_##T##_##S##_gpu_kernel_reg(#OPNAME, ATTR,          \
                                                                         []() { return new OPCLASS<T, S>(); });

 // register of mixed accuracy kernels which use template and maintain three typename
 #define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G)                                     \
  static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S, G>>::value, " must be base of GpuKernel"); \
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/other/check_valid_gpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/other/check_valid_gpu_kernel.cc
@@ -31,6 +31,6 @@ MS_REG_GPU_KERNEL_TWO(
  CheckValidGpuKernel, short, bool)  // NOLINT
 MS_REG_GPU_KERNEL_TWO(
  CheckValid, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeBool),
  CheckValidGpuKernel, char, bool)
  CheckValidGpuKernel, uchar, bool)
 }  // namespace kernel
 }  // namespace mindspore