| @@ -53,6 +53,13 @@ __global__ void ReciprocalKernel(T *input, T *output, size_t count) { | |||
| return; | |||
| } | |||
| template <typename T> | |||
| __global__ void SquareKernel(T *input, T *output, size_t count) { | |||
| for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { | |||
| output[i] = input[i] * input[i]; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void Exponential(T *input, T *output, size_t count, cudaStream_t cuda_stream) { | |||
| ExponentialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); | |||
| return; | |||
| @@ -72,12 +79,18 @@ void Reciprocal(T *input, T *output, size_t count, cudaStream_t cuda_stream) { | |||
| ReciprocalKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); | |||
| return; | |||
| } | |||
| template <typename T> | |||
| void Square(T *input, T *output, size_t count, cudaStream_t cuda_stream) { | |||
| SquareKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); | |||
| return; | |||
| } | |||
| template void Exponential<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Logarithm<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Negative<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Reciprocal<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Square<float>(float *input, float *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Exponential<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Logarithm<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Negative<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Reciprocal<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| template void Square<half>(half *input, half *output, size_t count, cudaStream_t cuda_stream); | |||
| @@ -26,5 +26,6 @@ template <typename T> | |||
| void Negative(T *input, T *output, size_t count, cudaStream_t cuda_stream); | |||
| template <typename T> | |||
| void Reciprocal(T *input, T *output, size_t count, cudaStream_t cuda_stream); | |||
| template <typename T> | |||
| void Square(T *input, T *output, size_t count, cudaStream_t cuda_stream); | |||
| #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_UNARYOPIMPL_H_ | |||
| @@ -41,8 +41,9 @@ void GpuKernelFactory::CheckIOParam(const std::string &kernel_name, const Kernel | |||
| size_t attr_index) { | |||
| if (kernel_info->GetInputNum() != iter_second->at(attr_index).first.GetInputSize()) { | |||
| if (iter_second->at(attr_index).first.GetAllSame()) { | |||
| auto dtype = iter_second->at(attr_index).first.GetInputAttr(0).first; | |||
| for (size_t attr = 1; attr < kernel_info->GetInputNum(); ++attr) { | |||
| (void)iter_second->at(attr_index).first.AddInputAttr(kernel_info->GetInputDeviceType(0)); | |||
| (void)iter_second->at(attr_index).first.AddInputAttr(dtype); | |||
| } | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "op[" << kernel_name << "] Input size is mismatching!"; | |||
| @@ -50,8 +51,9 @@ void GpuKernelFactory::CheckIOParam(const std::string &kernel_name, const Kernel | |||
| } | |||
| if (kernel_info->GetOutputNum() != iter_second->at(attr_index).first.GetOutputSize()) { | |||
| if (iter_second->at(attr_index).first.GetAllSame()) { | |||
| auto dtype = iter_second->at(attr_index).first.GetOutputAttr(0).first; | |||
| for (size_t attr = 1; attr < kernel_info->GetOutputNum(); ++attr) { | |||
| (void)iter_second->at(attr_index).first.AddOutputAttr(kernel_info->GetOutputDeviceType(0)); | |||
| (void)iter_second->at(attr_index).first.AddOutputAttr(dtype); | |||
| } | |||
| } else { | |||
| MS_LOG(EXCEPTION) << "op[" << kernel_name << "] Output size is mismatching!"; | |||
| @@ -38,5 +38,13 @@ MS_REG_GPU_KERNEL_ONE( | |||
| MS_REG_GPU_KERNEL_ONE( | |||
| Sub, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| BinaryOpGpuKernel, half) | |||
| MS_REG_GPU_KERNEL_ONE( | |||
| Maximum, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| BinaryOpGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE( | |||
| Maximum, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| BinaryOpGpuKernel, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -27,12 +27,16 @@ | |||
| #include "kernel/gpu/kernel_constants.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| enum BinaryOpType { BINARY_OP_ADD = 0, BINARY_OP_SUB, BINARY_OP_MUL, BINARY_OP_DIV, BINARY_OP_INVALID_TYPE = 255 }; | |||
| const std::map<std::string, BinaryOpType> kBinaryOpTypeMap = { | |||
| {"Sub", BINARY_OP_SUB}, | |||
| {"Mul", BINARY_OP_MUL}, | |||
| {"RealDiv", BINARY_OP_DIV}, | |||
| enum BinaryOpType { | |||
| BINARY_OP_ADD = 0, | |||
| BINARY_OP_SUB, | |||
| BINARY_OP_MUL, | |||
| BINARY_OP_DIV, | |||
| BINARY_OP_MAX, | |||
| BINARY_OP_INVALID_TYPE = 255 | |||
| }; | |||
| static const std::map<std::string, BinaryOpType> kBinaryOpTypeMap = { | |||
| {"Sub", BINARY_OP_SUB}, {"Mul", BINARY_OP_MUL}, {"RealDiv", BINARY_OP_DIV}, {"Maximum", BINARY_OP_MAX}}; | |||
| template <typename T> | |||
| class BinaryOpGpuKernel : public GpuKernel { | |||
| public: | |||
| @@ -84,6 +88,10 @@ class BinaryOpGpuKernel : public GpuKernel { | |||
| inputB_addr = workspace_addr; | |||
| break; | |||
| } | |||
| case BINARY_OP_MAX: { | |||
| inputB_addr = input_addr2; | |||
| break; | |||
| } | |||
| default: { | |||
| MS_LOG(EXCEPTION) << "Binary operation " << binary_op_type_ << " is not supported."; | |||
| } | |||
| @@ -201,6 +209,10 @@ class BinaryOpGpuKernel : public GpuKernel { | |||
| tensor_op_ = CUDNN_OP_TENSOR_ADD; | |||
| break; | |||
| } | |||
| case BINARY_OP_MAX: { | |||
| tensor_op_ = CUDNN_OP_TENSOR_MAX; | |||
| break; | |||
| } | |||
| default: { | |||
| MS_LOG(EXCEPTION) << "Binary operation " << binary_op_type_ << " is not supported."; | |||
| } | |||
| @@ -38,5 +38,9 @@ MS_REG_GPU_KERNEL_ONE(ZerosLike, KernelAttr().AddInputAttr(kNumberTypeFloat32).A | |||
| UnaryOpGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(ZerosLike, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| UnaryOpGpuKernel, half) | |||
| MS_REG_GPU_KERNEL_ONE(Square, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), | |||
| UnaryOpGpuKernel, float) | |||
| MS_REG_GPU_KERNEL_ONE(Square, KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), | |||
| UnaryOpGpuKernel, half) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -33,13 +33,15 @@ enum UnaryOptype { | |||
| UNARY_OP_NEG, | |||
| UNARY_OP_RECIPROCAL, | |||
| UNARY_OP_ZEROSLIKE, | |||
| UNARY_OP_SQUARE, | |||
| UNARY_OP_INVALID_TYPE = 255 | |||
| }; | |||
| const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY_OP_EXP}, | |||
| {"Log", UNARY_OP_LOG}, | |||
| {"Neg", UNARY_OP_NEG}, | |||
| {"Reciprocal", UNARY_OP_RECIPROCAL}, | |||
| {"ZerosLike", UNARY_OP_ZEROSLIKE}}; | |||
| static const std::map<std::string, UnaryOptype> kUnaryOpTypeMap = {{"Exp", UNARY_OP_EXP}, | |||
| {"Log", UNARY_OP_LOG}, | |||
| {"Neg", UNARY_OP_NEG}, | |||
| {"Reciprocal", UNARY_OP_RECIPROCAL}, | |||
| {"ZerosLike", UNARY_OP_ZEROSLIKE}, | |||
| {"Square", UNARY_OP_SQUARE}}; | |||
| template <typename T> | |||
| class UnaryOpGpuKernel : public GpuKernel { | |||
| public: | |||
| @@ -74,6 +76,10 @@ class UnaryOpGpuKernel : public GpuKernel { | |||
| Reciprocal(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| break; | |||
| } | |||
| case UNARY_OP_SQUARE: { | |||
| Square(input_addr, output_addr, inputs[0]->size / sizeof(T), reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| break; | |||
| } | |||
| case UNARY_OP_ZEROSLIKE: { | |||
| return true; | |||
| } | |||
| @@ -93,12 +99,12 @@ class UnaryOpGpuKernel : public GpuKernel { | |||
| } | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(ERROR) << "Input number is " << input_num << ", but negative op needs 1 inputs."; | |||
| MS_LOG(ERROR) << "Input number is " << input_num << ", but unary op needs 1 inputs."; | |||
| return false; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(ERROR) << "Output number is " << output_num << ", but negative op needs 1 output."; | |||
| MS_LOG(ERROR) << "Output number is " << output_num << ", but unary op needs 1 output."; | |||
| return false; | |||
| } | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| @@ -48,14 +48,10 @@ class FlattenGpuFwdKernel : public GpuKernel { | |||
| } | |||
| bool Init(const CNodePtr &kernel_node) override { | |||
| auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| input_size_ = sizeof(T); | |||
| for (size_t i = 0; i < shape.size(); ++i) { | |||
| if (input_size_ == 0) { | |||
| input_size_ = 1; | |||
| } | |||
| input_size_ *= shape[i]; | |||
| } | |||
| input_size_ = input_size_ * sizeof(T); | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| @@ -189,6 +189,12 @@ VectorRef MsBackend::MsRunGraph(const GraphId &g, const VectorRef &args) { | |||
| } else if (utils::isa<PyObjectRef>(arg)) { | |||
| auto value = utils::cast<PyObjectRef>(arg).object_; | |||
| inputs.push_back(py::cast<tensor::TensorPtr>(value)); | |||
| } else if (utils::isa<VectorRefPtr>(arg)) { | |||
| auto args_new = utils::cast<VectorRef>(arg); | |||
| (void)std::transform(args_new.begin(), args_new.end(), std::back_inserter(inputs), | |||
| [](const BaseRef &v) { return utils::cast<tensor::TensorPtr>(v); }); | |||
| } else { | |||
| MS_LOG(WARNING) << "Invalid input type."; | |||
| } | |||
| } | |||