Merge pull request !4436 from peixu_ren/custom_gputags/v0.7.0-beta
| @@ -19,19 +19,26 @@ template <typename T> | |||||
| __global__ void NormalKernel(int seed, curandState *globalState, T *output, size_t count) { | __global__ void NormalKernel(int seed, curandState *globalState, T *output, size_t count) { | ||||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | ||||
| curand_init(seed, i, 0, &globalState[i]); | curand_init(seed, i, 0, &globalState[i]); | ||||
| output[i] = curand_normal(&globalState[i]); | |||||
| output[i] = (T)curand_normal(&globalState[i]); | |||||
| } | } | ||||
| return; | return; | ||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| __global__ void UniformKernel(int seed, curandState *globalState, T *input1, size_t input_size_1, | |||||
| T *input2, size_t input_size_2, T *output, size_t count) { | |||||
| __global__ void UniformIntKernel(int seed, curandState *globalState, T *input1, size_t input_size_1, | |||||
| T *input2, size_t input_size_2, T *output, size_t count) { | |||||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | ||||
| input1[i] = (input_size_1 == 1 ? input1[0] : input1[i]); | |||||
| input2[i] = (input_size_2 == 1 ? input2[0] : input2[i]); | |||||
| curand_init(seed, i, 0, &globalState[i]); | curand_init(seed, i, 0, &globalState[i]); | ||||
| output[i] = curand_uniform(&globalState[i]) * (input2[i] - input1[i]) + input1[i]; | |||||
| output[i] = (T)(curand_uniform(&globalState[i])) * (input2[0] - input1[0]) + input1[0]; | |||||
| } | |||||
| return; | |||||
| } | |||||
| template <typename T> | |||||
| __global__ void UniformRealKernel(int seed, curandState *globalState, T *output, size_t count) { | |||||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | |||||
| curand_init(seed, i, 0, &globalState[i]); | |||||
| output[i] = (T)curand_uniform(&globalState[i]); | |||||
| } | } | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -51,16 +58,46 @@ void StandardNormal(int seed, int seed2, curandState *globalState, T *output, si | |||||
| } | } | ||||
| template <typename T> | template <typename T> | ||||
| void UniformReal(int seed, curandState *globalState, T *input1, size_t input_size_1, | |||||
| T *input2, size_t input_size_2, T *output, size_t count, cudaStream_t cuda_stream) { | |||||
| seed = (seed == 0 ? time(NULL):seed); | |||||
| UniformKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>> | |||||
| (seed, globalState, input1, input_size_1, input2, input_size_2, output, count); | |||||
| void UniformInt(int seed, int seed2, curandState *globalState, T *input1, size_t input_size_1, | |||||
| T *input2, size_t input_size_2, T *output, size_t count, cudaStream_t cuda_stream) { | |||||
| int RNG_seed = 0; | |||||
| if (seed2 != 0) { | |||||
| RNG_seed = seed2; | |||||
| } else if (seed != 0) { | |||||
| RNG_seed = seed; | |||||
| } else { | |||||
| RNG_seed = time(NULL); | |||||
| } | |||||
| UniformIntKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>> | |||||
| (RNG_seed, globalState, input1, input_size_1, input2, input_size_2, output, count); | |||||
| return; | |||||
| } | |||||
| template <typename T> | |||||
| void UniformReal(int seed, int seed2, curandState *globalState, T *output, size_t count, cudaStream_t cuda_stream) { | |||||
| int RNG_seed = 0; | |||||
| if (seed2 != 0) { | |||||
| RNG_seed = seed2; | |||||
| } else if (seed != 0) { | |||||
| RNG_seed = seed; | |||||
| } else { | |||||
| RNG_seed = time(NULL); | |||||
| } | |||||
| UniformRealKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(RNG_seed, globalState, output, count); | |||||
| return; | return; | ||||
| } | } | ||||
| template void StandardNormal<float>(int seed, int seed2, curandState *globalState, | template void StandardNormal<float>(int seed, int seed2, curandState *globalState, | ||||
| float *output, size_t count, cudaStream_t cuda_stream); | float *output, size_t count, cudaStream_t cuda_stream); | ||||
| template void UniformReal<float>(int seed, curandState *globalState, float *input1, size_t input_size_1, | |||||
| float *input2, size_t input_size_2, float *output, size_t count, | |||||
| cudaStream_t cuda_stream); | |||||
| template void StandardNormal<int>(int seed, int seed2, curandState *globalState, | |||||
| int *output, size_t count, cudaStream_t cuda_stream); | |||||
| template void UniformInt<float>(int seed, int seed2, curandState *globalState, float *input1, size_t input_size_1, | |||||
| float *input2, size_t input_size_2, float *output, size_t count, | |||||
| cudaStream_t cuda_stream); | |||||
| template void UniformInt<int>(int seed, int seed2, curandState *globalState, int *input1, size_t input_size_1, | |||||
| int *input2, size_t input_size_2, int *output, size_t count, | |||||
| cudaStream_t cuda_stream); | |||||
| template void UniformReal<float>(int seed, int seed2, curandState *globalState, | |||||
| float *output, size_t count, cudaStream_t cuda_stream); | |||||
| template void UniformReal<int>(int seed, int seed2, curandState *globalState, | |||||
| int *output, size_t count, cudaStream_t cuda_stream); | |||||
| @@ -24,7 +24,10 @@ template <typename T> | |||||
| void StandardNormal(int seed, int seed2, curandState *globalState, | void StandardNormal(int seed, int seed2, curandState *globalState, | ||||
| T *output, size_t count, cudaStream_t cuda_stream); | T *output, size_t count, cudaStream_t cuda_stream); | ||||
| template <typename T> | template <typename T> | ||||
| void UniformReal(int seed, curandState *globalState, | |||||
| T *input1, size_t input_size_1, T *input2, size_t input_size_2, | |||||
| T *output, size_t count, cudaStream_t cuda_stream); | |||||
| void UniformInt(int seed, int seed2, curandState *globalState, | |||||
| T *input1, size_t input_size_1, T *input2, size_t input_size_2, | |||||
| T *output, size_t count, cudaStream_t cuda_stream); | |||||
| template <typename T> | |||||
| void UniformReal(int seed, int seed2, curandState *globalState, | |||||
| T *output, size_t count, cudaStream_t cuda_stream); | |||||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANDOMOPIMPL_H_ | #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_RANDOMOPIMPL_H_ | ||||
| @@ -20,12 +20,14 @@ namespace mindspore { | |||||
| namespace kernel { | namespace kernel { | ||||
| MS_REG_GPU_KERNEL_ONE(StandardNormal, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32), | MS_REG_GPU_KERNEL_ONE(StandardNormal, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32), | ||||
| RandomOpGpuKernel, float) | RandomOpGpuKernel, float) | ||||
| MS_REG_GPU_KERNEL_ONE(UniformReal, | |||||
| MS_REG_GPU_KERNEL_ONE(UniformInt, | |||||
| KernelAttr() | KernelAttr() | ||||
| .AddInputAttr(kNumberTypeInt32) | .AddInputAttr(kNumberTypeInt32) | ||||
| .AddInputAttr(kNumberTypeFloat32) | |||||
| .AddInputAttr(kNumberTypeFloat32) | |||||
| .AddOutputAttr(kNumberTypeFloat32), | |||||
| .AddInputAttr(kNumberTypeInt32) | |||||
| .AddInputAttr(kNumberTypeInt32) | |||||
| .AddOutputAttr(kNumberTypeInt32), | |||||
| RandomOpGpuKernel, int) | |||||
| MS_REG_GPU_KERNEL_ONE(UniformReal, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32), | |||||
| RandomOpGpuKernel, float) | RandomOpGpuKernel, float) | ||||
| } // namespace kernel | } // namespace kernel | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -28,16 +28,17 @@ | |||||
| namespace mindspore { | namespace mindspore { | ||||
| namespace kernel { | namespace kernel { | ||||
| enum RandomOptype { RANDOM_OP_NORMAL = 0, RANDOM_OP_UNIFORM_REAL, RANDOM_OP_INVALID_TYPE = 255 }; | |||||
| enum RandomOptype { RANDOM_OP_NORMAL = 0, RANDOM_OP_UNIFORM_INT, RANDOM_OP_UNIFORM_REAL, RANDOM_OP_INVALID_TYPE = 255 }; | |||||
| const std::map<std::string, RandomOptype> kRandomOpTypeMap = { | |||||
| {"StandardNormal", RANDOM_OP_NORMAL}, {"UniformInt", RANDOM_OP_UNIFORM_INT}, {"UniformReal", RANDOM_OP_UNIFORM_REAL}}; | |||||
| const std::map<std::string, RandomOptype> kRandomOpTypeMap = {{"StandardNormal", RANDOM_OP_NORMAL}, | |||||
| {"UniformReal", RANDOM_OP_UNIFORM_REAL}}; | |||||
| template <typename T> | template <typename T> | ||||
| class RandomOpGpuKernel : public GpuKernel { | class RandomOpGpuKernel : public GpuKernel { | ||||
| public: | public: | ||||
| RandomOpGpuKernel() | RandomOpGpuKernel() | ||||
| : random_op_type_(RANDOM_OP_INVALID_TYPE), | : random_op_type_(RANDOM_OP_INVALID_TYPE), | ||||
| input_size_0_(sizeof(int)), | |||||
| input_size_0_(sizeof(0)), | |||||
| input_size_1_(sizeof(T)), | input_size_1_(sizeof(T)), | ||||
| input_size_2_(sizeof(T)), | input_size_2_(sizeof(T)), | ||||
| output_size_(sizeof(T)), | output_size_(sizeof(T)), | ||||
| @@ -62,11 +63,16 @@ class RandomOpGpuKernel : public GpuKernel { | |||||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | reinterpret_cast<cudaStream_t>(stream_ptr)); | ||||
| break; | break; | ||||
| } | } | ||||
| case RANDOM_OP_UNIFORM_REAL: { | |||||
| case RANDOM_OP_UNIFORM_INT: { | |||||
| T *input_addr_1 = GetDeviceAddress<T>(inputs, 1); | T *input_addr_1 = GetDeviceAddress<T>(inputs, 1); | ||||
| T *input_addr_2 = GetDeviceAddress<T>(inputs, 2); | T *input_addr_2 = GetDeviceAddress<T>(inputs, 2); | ||||
| UniformReal(seed_, devStates, input_addr_1, inputs[1]->size / sizeof(T), input_addr_2, | |||||
| inputs[2]->size / sizeof(T), output_addr, outputs[0]->size / sizeof(T), | |||||
| UniformInt(seed_, seed2_, devStates, input_addr_1, inputs[1]->size / sizeof(T), input_addr_2, | |||||
| inputs[2]->size / sizeof(T), output_addr, outputs[0]->size / sizeof(T), | |||||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||||
| break; | |||||
| } | |||||
| case RANDOM_OP_UNIFORM_REAL: { | |||||
| UniformReal(seed_, seed2_, devStates, output_addr, outputs[0]->size / sizeof(T), | |||||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | reinterpret_cast<cudaStream_t>(stream_ptr)); | ||||
| break; | break; | ||||
| } | } | ||||
| @@ -86,11 +92,11 @@ class RandomOpGpuKernel : public GpuKernel { | |||||
| random_op_type_ = iter->second; | random_op_type_ = iter->second; | ||||
| } | } | ||||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | ||||
| if (random_op_type_ == RANDOM_OP_NORMAL && input_num != 1) { | |||||
| if ((random_op_type_ == RANDOM_OP_NORMAL || random_op_type_ == RANDOM_OP_UNIFORM_REAL) && input_num != 1) { | |||||
| MS_LOG(ERROR) << "Input number is " << input_num << ", but random op needs 1 input."; | MS_LOG(ERROR) << "Input number is " << input_num << ", but random op needs 1 input."; | ||||
| return false; | return false; | ||||
| } | } | ||||
| if (random_op_type_ == RANDOM_OP_UNIFORM_REAL && input_num != 3) { | |||||
| if (random_op_type_ == RANDOM_OP_UNIFORM_INT && input_num != 3) { | |||||
| MS_LOG(ERROR) << "Input number is " << input_num << ", but random op needs 3 inputs."; | MS_LOG(ERROR) << "Input number is " << input_num << ", but random op needs 3 inputs."; | ||||
| return false; | return false; | ||||
| } | } | ||||
| @@ -104,15 +110,9 @@ class RandomOpGpuKernel : public GpuKernel { | |||||
| input_size_0_ += input_shape_0[i]; | input_size_0_ += input_shape_0[i]; | ||||
| } | } | ||||
| input_size_0_ *= sizeof(int); | input_size_0_ *= sizeof(int); | ||||
| if (random_op_type_ == RANDOM_OP_UNIFORM_REAL) { | |||||
| auto input_shape_1 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||||
| for (size_t i = 0; i < input_shape_1.size(); i++) { | |||||
| input_size_1_ *= input_shape_1[i]; | |||||
| } | |||||
| auto input_shape_2 = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||||
| for (size_t i = 0; i < input_shape_2.size(); i++) { | |||||
| input_size_2_ *= input_shape_2[i]; | |||||
| } | |||||
| if (random_op_type_ == RANDOM_OP_UNIFORM_INT) { | |||||
| input_size_1_ *= 1; | |||||
| input_size_2_ *= 1; | |||||
| } | } | ||||
| auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | ||||
| for (size_t i = 0; i < output_shape.size(); i++) { | for (size_t i = 0; i < output_shape.size(); i++) { | ||||
| @@ -120,9 +120,7 @@ class RandomOpGpuKernel : public GpuKernel { | |||||
| workspace_size_ *= output_shape[i]; | workspace_size_ *= output_shape[i]; | ||||
| } | } | ||||
| seed_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("seed")); | seed_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("seed")); | ||||
| if (random_op_type_ == RANDOM_OP_NORMAL) { | |||||
| seed2_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("seed2")); | |||||
| } | |||||
| seed2_ = GetValue<int>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("seed2")); | |||||
| InitSizeLists(); | InitSizeLists(); | ||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -130,7 +128,7 @@ class RandomOpGpuKernel : public GpuKernel { | |||||
| protected: | protected: | ||||
| void InitSizeLists() override { | void InitSizeLists() override { | ||||
| input_size_list_.push_back(input_size_0_); | input_size_list_.push_back(input_size_0_); | ||||
| if (random_op_type_ == RANDOM_OP_UNIFORM_REAL) { | |||||
| if (random_op_type_ == RANDOM_OP_UNIFORM_INT) { | |||||
| input_size_list_.push_back(input_size_1_); | input_size_list_.push_back(input_size_1_); | ||||
| input_size_list_.push_back(input_size_2_); | input_size_list_.push_back(input_size_2_); | ||||
| } | } | ||||