Merge pull request !4479 from Peilin/gpu-reg-macro-fixtags/v0.7.0-beta
| @@ -32,7 +32,7 @@ MS_REG_GPU_KERNEL_ONE(Concat, | |||||
| ConcatV2GpuFwdKernel, short) // NOLINT | ConcatV2GpuFwdKernel, short) // NOLINT | ||||
| MS_REG_GPU_KERNEL_ONE(Concat, | MS_REG_GPU_KERNEL_ONE(Concat, | ||||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | ||||
| ConcatV2GpuFwdKernel, char) | |||||
| ConcatV2GpuFwdKernel, uchar) | |||||
| MS_REG_GPU_KERNEL_ONE(Concat, | MS_REG_GPU_KERNEL_ONE(Concat, | ||||
| KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), | KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), | ||||
| ConcatV2GpuFwdKernel, bool) | ConcatV2GpuFwdKernel, bool) | ||||
| @@ -34,7 +34,7 @@ MS_REG_GPU_KERNEL_TWO( | |||||
| GatherNdGpuFwdKernel, short, int) // NOLINT | GatherNdGpuFwdKernel, short, int) // NOLINT | ||||
| MS_REG_GPU_KERNEL_TWO( | MS_REG_GPU_KERNEL_TWO( | ||||
| GatherNd, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt8), | GatherNd, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeUInt8), | ||||
| GatherNdGpuFwdKernel, char, int) | |||||
| GatherNdGpuFwdKernel, uchar, int) | |||||
| MS_REG_GPU_KERNEL_TWO( | MS_REG_GPU_KERNEL_TWO( | ||||
| GatherNd, KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool), | GatherNd, KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool), | ||||
| GatherNdGpuFwdKernel, bool, int) | GatherNdGpuFwdKernel, bool, int) | ||||
| @@ -27,7 +27,7 @@ MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeInt32). | |||||
| MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), | MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), | ||||
| StridedSliceGpuKernel, short) // NOLINT | StridedSliceGpuKernel, short) // NOLINT | ||||
| MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | ||||
| StridedSliceGpuKernel, char) | |||||
| StridedSliceGpuKernel, uchar) | |||||
| MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), | MS_REG_GPU_KERNEL_ONE(StridedSlice, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), | ||||
| StridedSliceGpuKernel, bool) | StridedSliceGpuKernel, bool) | ||||
| } // namespace kernel | } // namespace kernel | ||||
| @@ -27,7 +27,7 @@ MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeInt | |||||
| MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), | MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16), | ||||
| StridedSliceGradGpuKernel, short) // NOLINT | StridedSliceGradGpuKernel, short) // NOLINT | ||||
| MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8), | ||||
| StridedSliceGradGpuKernel, char) | |||||
| StridedSliceGradGpuKernel, uchar) | |||||
| MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), | MS_REG_GPU_KERNEL_ONE(StridedSliceGrad, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), | ||||
| StridedSliceGradGpuKernel, bool) | StridedSliceGradGpuKernel, bool) | ||||
| } // namespace kernel | } // namespace kernel | ||||
| @@ -37,7 +37,8 @@ __global__ void CheckValidKernel(const size_t size, const T *box, const T *img_m | |||||
| } | } | ||||
| template <typename S> | template <typename S> | ||||
| __global__ void CheckValidKernel(const size_t size, const char *box, const char *img_metas, S *valid) { | |||||
| __global__ void CheckValidKernel(const size_t size, const unsigned char *box, | |||||
| const unsigned char *img_metas, S *valid) { | |||||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) { | for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) { | ||||
| const size_t left_x = i * 4; | const size_t left_x = i * 4; | ||||
| const size_t left_y = i * 4 + 1; | const size_t left_y = i * 4 + 1; | ||||
| @@ -45,10 +46,8 @@ __global__ void CheckValidKernel(const size_t size, const char *box, const char | |||||
| const size_t right_y = i * 4 + 3; | const size_t right_y = i * 4 + 3; | ||||
| S valid_flag = false; | S valid_flag = false; | ||||
| valid_flag |= !((unsigned int)box[left_x] >= 0); | |||||
| valid_flag |= !((unsigned int)box[left_y] >= 0); | |||||
| valid_flag |= !((unsigned int)img_metas[0] * (unsigned int)img_metas[2] - 1 >= (unsigned int)box[right_x]); | |||||
| valid_flag |= !((unsigned int)img_metas[1] * (unsigned int)img_metas[2] - 1 >= (unsigned int)box[right_y]); | |||||
| valid_flag |= !(img_metas[0] * img_metas[2] >= box[right_x] + 1); | |||||
| valid_flag |= !(img_metas[1] * img_metas[2] >= box[right_y] + 1); | |||||
| valid[i] = !valid_flag; | valid[i] = !valid_flag; | ||||
| } | } | ||||
| @@ -67,5 +66,5 @@ template void CheckValid(const size_t &size, const half *box, const half *img_me | |||||
| cudaStream_t cuda_stream); | cudaStream_t cuda_stream); | ||||
| template void CheckValid(const size_t &size, const short *box, const short *img_metas, bool *valid, // NOLINT | template void CheckValid(const size_t &size, const short *box, const short *img_metas, bool *valid, // NOLINT | ||||
| cudaStream_t cuda_stream); | cudaStream_t cuda_stream); | ||||
| template void CheckValid(const size_t &size, const char *box, const char *img_metas, bool *valid, | |||||
| template void CheckValid(const size_t &size, const unsigned char *box, const unsigned char *img_metas, bool *valid, | |||||
| cudaStream_t cuda_stream); | cudaStream_t cuda_stream); | ||||
| @@ -73,7 +73,7 @@ template void ConcatKernel(const size_t size, const int input_num, | |||||
| cudaStream_t cuda_stream); | cudaStream_t cuda_stream); | ||||
| template void ConcatKernel(const size_t size, const int input_num, | template void ConcatKernel(const size_t size, const int input_num, | ||||
| const int all_size_before_axis, const int all_size_axis, | const int all_size_before_axis, const int all_size_axis, | ||||
| int* len_axis, char** inputs, char* output, | |||||
| int* len_axis, unsigned char** inputs, unsigned char* output, | |||||
| cudaStream_t cuda_stream); | cudaStream_t cuda_stream); | ||||
| template void ConcatKernel(const size_t size, const int input_num, | template void ConcatKernel(const size_t size, const int input_num, | ||||
| const int all_size_before_axis, const int all_size_axis, | const int all_size_before_axis, const int all_size_axis, | ||||
| @@ -64,11 +64,12 @@ template void GatherNd<int, int>(int *input, int *indices, int *output, const si | |||||
| const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, | const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, | ||||
| int *batch_strides, cudaStream_t stream); | int *batch_strides, cudaStream_t stream); | ||||
| template void GatherNd<short, int>(short *input, int *indices, short *output, const size_t &output_dim0, // NOLINT | template void GatherNd<short, int>(short *input, int *indices, short *output, const size_t &output_dim0, // NOLINT | ||||
| const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, | |||||
| int *batch_strides, cudaStream_t stream); | |||||
| template void GatherNd<char, int>(char *input, int *indices, char *output, const size_t &output_dim0, | |||||
| const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, | |||||
| int *batch_strides, cudaStream_t stream); | |||||
| const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, | |||||
| int *batch_strides, cudaStream_t stream); | |||||
| template void GatherNd<unsigned char, int>(unsigned char *input, int *indices, unsigned char *output, | |||||
| const size_t &output_dim0, const size_t &output_dim1, | |||||
| const size_t &indices_dim1, int *batch_indices, int *batch_strides, | |||||
| cudaStream_t stream); | |||||
| template void GatherNd<bool, int>(bool *input, int *indices, bool *output, const size_t &output_dim0, | template void GatherNd<bool, int>(bool *input, int *indices, bool *output, const size_t &output_dim0, | ||||
| const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, | const size_t &output_dim1, const size_t &indices_dim1, int *batch_indices, | ||||
| int *batch_strides, cudaStream_t stream); | int *batch_strides, cudaStream_t stream); | ||||
| @@ -180,8 +180,7 @@ template void CalSliceGrad<int>(const size_t input_size, const int *dy, const st | |||||
| const std::vector<int> begin, const std::vector<int> size, int *output, | const std::vector<int> begin, const std::vector<int> size, int *output, | ||||
| cudaStream_t cuda_stream); | cudaStream_t cuda_stream); | ||||
| // NOLINTNEXTLINE | |||||
| template void FillDeviceArray<short>(const size_t input_size, short *addr, const float value, cudaStream_t cuda_stream); | |||||
| template void FillDeviceArray<short>(const size_t input_size, short *addr, const float value, cudaStream_t cuda_stream); // NOLINT | |||||
| template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2, | template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2, | ||||
| const int l3, const int l4, const int d1, const int d2, const int d3, const int d4, | const int l3, const int l4, const int d1, const int d2, const int d3, const int d4, | ||||
| const short *input, short *output, cudaStream_t stream); // NOLINT | const short *input, short *output, cudaStream_t stream); // NOLINT | ||||
| @@ -189,13 +188,14 @@ template void CalSliceGrad<short>(const size_t input_size, const short *dy, cons | |||||
| const std::vector<int> begin, const std::vector<int> size, short *output, // NOLINT | const std::vector<int> begin, const std::vector<int> size, short *output, // NOLINT | ||||
| cudaStream_t cuda_stream); | cudaStream_t cuda_stream); | ||||
| template void FillDeviceArray<char>(const size_t input_size, char *addr, const float value, cudaStream_t cuda_stream); | |||||
| template void FillDeviceArray<unsigned char>(const size_t input_size, unsigned char *addr, const float value, | |||||
| cudaStream_t cuda_stream); | |||||
| template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2, | template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2, | ||||
| const int l3, const int l4, const int d1, const int d2, const int d3, const int d4, | const int l3, const int l4, const int d1, const int d2, const int d3, const int d4, | ||||
| const char *input, char *output, cudaStream_t stream); | |||||
| template void CalSliceGrad<char>(const size_t input_size, const char *dy, const std::vector<int> in_shape, | |||||
| const std::vector<int> begin, const std::vector<int> size, char *output, | |||||
| cudaStream_t cuda_stream); | |||||
| const unsigned char *input, unsigned char *output, cudaStream_t stream); | |||||
| template void CalSliceGrad<unsigned char>(const size_t input_size, const unsigned char *dy, | |||||
| const std::vector<int> in_shape, const std::vector<int> begin, | |||||
| const std::vector<int> size, unsigned char *output, cudaStream_t cuda_stream); | |||||
| template void FillDeviceArray<bool>(const size_t input_size, bool *addr, const float value, cudaStream_t cuda_stream); | template void FillDeviceArray<bool>(const size_t input_size, bool *addr, const float value, cudaStream_t cuda_stream); | ||||
| template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2, | template void Slice4DKernel(const int s1, const int s2, const int s3, const int s4, const int l1, const int l2, | ||||
| @@ -215,12 +215,11 @@ template void StridedSlice(const std::vector<size_t> &input_shape, const std::ve | |||||
| const std::vector<int> &strides, const std::vector<int> &output_shape, const int *input, | const std::vector<int> &strides, const std::vector<int> &output_shape, const int *input, | ||||
| int *output, cudaStream_t cuda_stream); | int *output, cudaStream_t cuda_stream); | ||||
| template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin, | template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin, | ||||
| // NOLINTNEXTLINE | |||||
| const std::vector<int> &strides, const std::vector<int> &output_shape, const short *input, | |||||
| short *output, cudaStream_t cuda_stream); // NOLINT | |||||
| const std::vector<int> &strides, const std::vector<int> &output_shape, | |||||
| const short *input, short *output, cudaStream_t cuda_stream); // NOLINT | |||||
| template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin, | template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin, | ||||
| const std::vector<int> &strides, const std::vector<int> &output_shape, const char *input, | |||||
| char *output, cudaStream_t cuda_stream); | |||||
| const std::vector<int> &strides, const std::vector<int> &output_shape, | |||||
| const unsigned char *input, unsigned char *output, cudaStream_t cuda_stream); | |||||
| template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin, | template void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin, | ||||
| const std::vector<int> &strides, const std::vector<int> &output_shape, const bool *input, | const std::vector<int> &strides, const std::vector<int> &output_shape, const bool *input, | ||||
| bool *output, cudaStream_t cuda_stream); | bool *output, cudaStream_t cuda_stream); | ||||
| @@ -235,12 +234,11 @@ template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vect | |||||
| const std::vector<int> &strides, const std::vector<int> &dx_shape, const int *dy, | const std::vector<int> &strides, const std::vector<int> &dx_shape, const int *dy, | ||||
| int *dx, cudaStream_t cuda_stream); | int *dx, cudaStream_t cuda_stream); | ||||
| template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin, | template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin, | ||||
| // NOLINTNEXTLINE | |||||
| const std::vector<int> &strides, const std::vector<int> &dx_shape, const short *dy, | |||||
| const std::vector<int> &strides, const std::vector<int> &dx_shape, const short *dy, // NOLINT | |||||
| short *dx, cudaStream_t cuda_stream); // NOLINT | short *dx, cudaStream_t cuda_stream); // NOLINT | ||||
| template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin, | template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin, | ||||
| const std::vector<int> &strides, const std::vector<int> &dx_shape, const char *dy, | |||||
| char *dx, cudaStream_t cuda_stream); | |||||
| const std::vector<int> &strides, const std::vector<int> &dx_shape, | |||||
| const unsigned char *dy, unsigned char *dx, cudaStream_t cuda_stream); | |||||
| template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin, | template void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin, | ||||
| const std::vector<int> &strides, const std::vector<int> &dx_shape, const bool *dy, | const std::vector<int> &strides, const std::vector<int> &dx_shape, const bool *dy, | ||||
| bool *dx, cudaStream_t cuda_stream); | bool *dx, cudaStream_t cuda_stream); | ||||
| @@ -64,6 +64,14 @@ class GpuKernelRegister { | |||||
| } | } | ||||
| }; | }; | ||||
| // This is necessary for gpu kernels to support uint8 data type. In cuda, an unsigned, | |||||
| // 8 bit integral type is represented by an unsigned char, but the MS_REG_GPU_KERNEL | |||||
| // macros defined below will create compilation errors when datatype T contains a space, | |||||
| // because the variable created by the macro will also contain a space. So, we solve this | |||||
| // problem by writing uchar when calling these macros, and expanding uchar after the | |||||
| // variable has been created. | |||||
| #define uchar unsigned char | |||||
| #define MS_REG_GPU_KERNEL(OPNAME, OPCLASS) \ | #define MS_REG_GPU_KERNEL(OPNAME, OPCLASS) \ | ||||
| static_assert(std::is_base_of<GpuKernel, OPCLASS>::value, " must be base of GpuKernel"); \ | static_assert(std::is_base_of<GpuKernel, OPCLASS>::value, " must be base of GpuKernel"); \ | ||||
| static const GpuKernelRegister g_##OPNAME##_gpu_kernel_reg(#OPNAME, KernelAttr(), []() { return new OPCLASS(); }); | static const GpuKernelRegister g_##OPNAME##_gpu_kernel_reg(#OPNAME, KernelAttr(), []() { return new OPCLASS(); }); | ||||
| @@ -88,7 +96,6 @@ class GpuKernelRegister { | |||||
| static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S>>::value, " must be base of GpuKernel"); \ | static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S>>::value, " must be base of GpuKernel"); \ | ||||
| static const GpuKernelRegister g_##OPNAME##_##T##_##S##_gpu_kernel_reg(#OPNAME, ATTR, \ | static const GpuKernelRegister g_##OPNAME##_##T##_##S##_gpu_kernel_reg(#OPNAME, ATTR, \ | ||||
| []() { return new OPCLASS<T, S>(); }); | []() { return new OPCLASS<T, S>(); }); | ||||
| // register of mixed accuracy kernels which use template and maintain three typename | // register of mixed accuracy kernels which use template and maintain three typename | ||||
| #define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G) \ | #define MS_REG_GPU_KERNEL_THREE(OPNAME, ATTR, OPCLASS, T, S, G) \ | ||||
| static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S, G>>::value, " must be base of GpuKernel"); \ | static_assert(std::is_base_of<GpuKernel, OPCLASS<T, S, G>>::value, " must be base of GpuKernel"); \ | ||||
| @@ -31,6 +31,6 @@ MS_REG_GPU_KERNEL_TWO( | |||||
| CheckValidGpuKernel, short, bool) // NOLINT | CheckValidGpuKernel, short, bool) // NOLINT | ||||
| MS_REG_GPU_KERNEL_TWO( | MS_REG_GPU_KERNEL_TWO( | ||||
| CheckValid, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeBool), | CheckValid, KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeBool), | ||||
| CheckValidGpuKernel, char, bool) | |||||
| CheckValidGpuKernel, uchar, bool) | |||||
| } // namespace kernel | } // namespace kernel | ||||
| } // namespace mindspore | } // namespace mindspore | ||||