Merge pull request !4479 from Peilin/gpu-reg-macro-fixtags/v0.7.0-beta
| @@ -32,7 +32,7 @@ MS_REG_GPU_KERNEL_ONE(Concat, | |||
| ConcatV2GpuFwdKernel, short) // NOLINT | |||
| MS_REG_GPU_KERNEL_ONE(Concat, | |||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | |||
| ConcatV2GpuFwdKernel, char) | |||
| ConcatV2GpuFwdKernel, uchar) | |||
| MS_REG_GPU_KERNEL_ONE(Concat, | |||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), | |||
| ConcatV2GpuFwdKernel, bool) | |||
| @@ -34,7 +34,7 @@ MS_REG_GPU_KERNEL_TWO( | |||
| GatherNdGpuFwdKernel, short, int) // NOLINT | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| GatherNd, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt8), | |||
| GatherNdGpuFwdKernel, char, int) | |||
| GatherNdGpuFwdKernel, uchar, int) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| GatherNd, KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool), | |||
| GatherNdGpuFwdKernel, bool, int) | |||
| @@ -27,7 +27,7 @@ MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeInt32). | |||
| MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), | |||
| StridedSliceGpuKernel, short) // NOLINT | |||
| MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | |||
| StridedSliceGpuKernel, char) | |||
| StridedSliceGpuKernel, uchar) | |||
| MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), | |||
| StridedSliceGpuKernel, bool) | |||
| } // namespace kernel | |||
| @@ -27,7 +27,7 @@ MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeInt | |||
| MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), | |||
| StridedSliceGradGpuKernel, short) // NOLINT | |||
| MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | |||
| StridedSliceGradGpuKernel, char) | |||
| StridedSliceGradGpuKernel, uchar) | |||
| MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), | |||
| StridedSliceGradGpuKernel, bool) | |||
| } // namespace kernel | |||
| @@ -37,7 +37,8 @@ __global__ void CheckValidKernel(const size_t size, const T *box, const T *img_m | |||
| } | |||
| template <typename S> | |||
| __global__ void CheckValidKernel(const size_t size, const char *box, const char *img_metas, S *valid) { | |||
| __global__ void CheckValidKernel(const size_t size, const unsigned char *box, | |||
| const unsigned char *img_metas, S *valid) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) { | |||
| const size_t left_x = i * 4; | |||
| const size_t left_y = i * 4 + 1; | |||
| @@ -45,10 +46,8 @@ __global__ void CheckValidKernel(const size_t size, const char *box, const char | |||
| const size_t right_y = i * 4 + 3; | |||
| S valid_flag = false; | |||
| valid_flag |= !((unsigned int)box[left_x] >= 0); | |||
| valid_flag |= !((unsigned int)box[left_y] >= 0); | |||
| valid_flag |= !((unsigned int)img_metas[0] * (unsigned int)img_metas[2] - 1 >= (unsigned int)box[right_x]); | |||
| valid_flag |= !((unsigned int)img_metas[1] * (unsigned int)img_metas[2] - 1 >= (unsigned int)box[right_y]); | |||
| valid_flag |= !(img_metas[0] * img_metas[2] >= box[right_x] + 1); | |||
| valid_flag |= !(img_metas[1] * img_metas[2] >= box[right_y] + 1); | |||
| valid[i] = !valid_flag; | |||
| } | |||
| @@ -67,5 +66,5 @@ template void CheckValid(const size_t &size, const half *box, const half *img_me | |||
| cudaStream_t cuda_stream); | |||
| template void CheckValid(const size_t &size, const short *box, const short *img_metas, bool *valid, // NOLINT | |||
| cudaStream_t cuda_stream); | |||
| template void CheckValid(const size_t &size, const char *box, const char *img_metas, bool *valid, | |||
| template void CheckValid(const size_t &size, const unsigned char *box, const unsigned char *img_metas, bool *valid, | |||
| cudaStream_t cuda_stream); | |||
| @@ -73,7 +73,7 @@ template void ConcatKernel(const size_t size, const int input_num, | |||
| cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| int* len_axis, char** inputs, char* output, | |||
| int* len_axis, unsigned char** inputs, unsigned char* output, | |||
| cudaStream_t cuda_stream); | |||
| template void ConcatKernel(const size_t size, const int input_num, | |||
| const int all_size_before_axis, const int all_size_axis, | |||
| @@ -64,11 +64,12 @@ template void GatherNd<int, int>(int *input, int *indices, int *output, const si | |||
| const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, | |||
| int *batch_strides, cudaStream_t stream); | |||
| template void GatherNd<short, int>(short *input, int *indices, short *output, const size_t &output_dim0, // NOLINT | |||
| const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, | |||
| int *batch_strides, cudaStream_t stream); | |||
| template void GatherNd<char, int>(char *input, int *indices, char *output, const size_t &output_dim0, | |||
| const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, | |||
| int *batch_strides, cudaStream_t stream); | |||
| const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, | |||
| int *batch_strides, cudaStream_t stream); | |||
| template void GatherNd<unsigned char, int>(unsigned char *input, int *indices, unsigned char *output, | |||
| const size_t &output_dim0, const size_t &output_dim1, | |||
| const size_t &indices_dim1, int *batch_indices, int *batch_strides, | |||
| cudaStream_t stream); | |||
| template void GatherNd<bool, int>(bool *input, int *indices, bool *output, const size_t &output_dim0, | |||
| const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, | |||
| int *batch_strides, cudaStream_t stream); | |||
| @@ -180,8 +180,7 @@ template void CalSliceGrad<int>(const size_t input_size, const int *dy, const st | |||
| const std::vector<int> begin, const std::vector<int> size, int *output, | |||
| cudaStream_t cuda_stream); | |||
| // NOLINTNEXTLINE | |||
| template void FillDeviceArray<short>(const size_t input_size, short *addr, const float value, cudaStream_t cuda_stream); | |||
| template void FillDeviceArray<short>(const size_t input_size, short *addr, const float value, cudaStream_t cuda_stream); // NOLINT | |||
| template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2, | |||
| const int l3, const int l4, const int d1, const int d2, const int d3, const int d4, | |||
| const short *input, short *output, cudaStream_t stream); // NOLINT | |||
| @@ -189,13 +188,14 @@ template void CalSliceGrad<short>(const size_t input_size, const short *dy, cons | |||
| const std::vector<int> begin, const std::vector<int> size, short *output, // NOLINT | |||
| cudaStream_t cuda_stream); | |||
| template void FillDeviceArray<char>(const size_t input_size, char *addr, const float value, cudaStream_t cuda_stream); | |||
| template void FillDeviceArray<unsigned char>(const size_t input_size, unsigned char *addr, const float value, | |||
| cudaStream_t cuda_stream); | |||
| template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2, | |||
| const int l3, const int l4, const int d1, const int d2, const int d3, const int d4, | |||
| const char *input, char *output, cudaStream_t stream); | |||
| template void CalSliceGrad<char>(const size_t input_size, const char *dy, const std::vector<int> in_shape, | |||
| const std::vector<int> begin, const std::vector<int> size, char *output, | |||
| cudaStream_t cuda_stream); | |||
| const unsigned char *input, unsigned char *output, cudaStream_t stream); | |||
| template void CalSliceGrad<unsigned char>(const size_t input_size, const unsigned char *dy, | |||
| const std::vector<int> in_shape, const std::vector<int> begin, | |||
| const std::vector<int> size, unsigned char *output, cudaStream_t cuda_stream); | |||
| template void FillDeviceArray<bool>(const size_t input_size, bool *addr, const float value, cudaStream_t cuda_stream); | |||
| template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2, | |||
| @@ -215,12 +215,11 @@ template void StridedSlice(const std::vector<size_t> &input_shape, const std::ve | |||
| const std::vector<int> &strides, const std::vector<int> &output_shape, const int *input, | |||
| int *output, cudaStream_t cuda_stream); | |||
| template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin, | |||
| // NOLINTNEXTLINE | |||
| const std::vector<int> &strides, const std::vector<int> &output_shape, const short *input, | |||
| short *output, cudaStream_t cuda_stream); // NOLINT | |||
| const std::vector<int> &strides, const std::vector<int> &output_shape, | |||
| const short *input, short *output, cudaStream_t cuda_stream); // NOLINT | |||
| template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin, | |||
| const std::vector<int> &strides, const std::vector<int> &output_shape, const char *input, | |||
| char *output, cudaStream_t cuda_stream); | |||
| const std::vector<int> &strides, const std::vector<int> &output_shape, | |||
| const unsigned char *input, unsigned char *output, cudaStream_t cuda_stream); | |||
| template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin, | |||
| const std::vector<int> &strides, const std::vector<int> &output_shape, const bool *input, | |||
| bool *output, cudaStream_t cuda_stream); | |||
| @@ -235,12 +234,11 @@ template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vect | |||
| const std::vector<int> &strides, const std::vector<int> &dx_shape, const int *dy, | |||
| int *dx, cudaStream_t cuda_stream); | |||
| template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin, | |||
| // NOLINTNEXTLINE | |||
| const std::vector<int> &strides, const std::vector<int> &dx_shape, const short *dy, | |||
| const std::vector<int> &strides, const std::vector<int> &dx_shape, const short *dy, // NOLINT | |||
| short *dx, cudaStream_t cuda_stream); // NOLINT | |||
| template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin, | |||
| const std::vector<int> &strides, const std::vector<int> &dx_shape, const char *dy, | |||
| char *dx, cudaStream_t cuda_stream); | |||
| const std::vector<int> &strides, const std::vector<int> &dx_shape, | |||
| const unsigned char *dy, unsigned char *dx, cudaStream_t cuda_stream); | |||
| template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin, | |||
| const std::vector<int> &strides, const std::vector<int> &dx_shape, const bool *dy, | |||
| bool *dx, cudaStream_t cuda_stream); | |||
| @@ -64,6 +64,14 @@ class GpuKernelRegister { | |||
| } | |||
| }; | |||
| // This is necessary for gpu kernels to support uint8 data type. In cuda, an unsigned, | |||
| // 8 bit integral type is represented by an unsigned char, but the MS_REG_GPU_KERNEL | |||
| // macros defined below will create compilation errors when datatype T contains a space, | |||
| // because the variable created by the macro will also contain a space. So, we solve this | |||
| // problem by writing uchar when calling these macros, and expanding uchar after the | |||
| // variable has been created. | |||
| #define uchar unsigned char | |||
| #define MS_REG_GPU_KERNEL(OPNAME, OPCLASS) \ | |||
| static_assert(std::is_base_of<GpuKernel, OPCLASS>::value, " must be base of GpuKernel"); \ | |||
| static const GpuKernelRegister g_##OPNAME##_gpu_kernel_reg(#OPNAME, KernelAttr(), []() { return new OPCLASS(); }); | |||
| @@ -88,7 +96,6 @@ class GpuKernelRegister { | |||
| static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S>>::value, " must be base of GpuKernel"); \ | |||
| static const GpuKernelRegister g_##OPNAME##_##T##_##S##_gpu_kernel_reg(#OPNAME, ATTR, \ | |||
| []() { return new OPCLASS<T, S>(); }); | |||
| // register of mixed accuracy kernels which use template and maintain three typename | |||
| #define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G) \ | |||
| static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S, G>>::value, " must be base of GpuKernel"); \ | |||
| @@ -31,6 +31,6 @@ MS_REG_GPU_KERNEL_TWO( | |||
| CheckValidGpuKernel, short, bool) // NOLINT | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| CheckValid, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeBool), | |||
| CheckValidGpuKernel, char, bool) | |||
| CheckValidGpuKernel, uchar, bool) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||