@@ -26,45 +26,171 @@ __device__ __forceinline__ half SqrtFunc(half input) {
  return hsqrt(input);
}
template <typename T>
template <typename T, typename S, typename G>
__global__ void ApplyAdagradKernel(const size_t size,
    const bool update_slots,
    const T *learning_rate,
    const T *gradient,
    const S *learning_rate,
    const G *gradient,
    T *variable,
    T *accumulation) {
    T *accumulation,
    T *variable_out,
    T *accumulation_out) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    if (update_slots) {
      accumulation[i] += gradient[i] * gradient[i];
      accumulation_out[i] = accumulation[i];
    }
    variable[i] -= learning_rate[0] * gradient[i] / SqrtFunc(accumulation[i]);
    variable_out[i] = variable[i];
  }
}
template <typename T>
template <>
__global__ void ApplyAdagradKernel(const size_t size,
    const bool update_slots,
    const float *learning_rate,
    const half *gradient,
    half *variable,
    half *accumulation,
    half *variable_out,
    half *accumulation_out) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    if (update_slots) {
      accumulation[i] += gradient[i] * gradient[i];
      accumulation_out[i] = accumulation[i];
    }
    variable[i] -= __float2half(learning_rate[0]) * gradient[i] / SqrtFunc(accumulation[i]);
    variable_out[i] = variable[i];
  }
}
template <>
__global__ void ApplyAdagradKernel(const size_t size,
    const bool update_slots,
    const float *learning_rate,
    const half *gradient,
    float *variable,
    float *accumulation,
    float *variable_out,
    float *accumulation_out) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    if (update_slots) {
      accumulation[i] += __half2float(gradient[i]) * __half2float(gradient[i]);
      accumulation_out[i] = accumulation[i];
    }
    variable[i] -= learning_rate[0] * __half2float(gradient[i]) / SqrtFunc(accumulation[i]);
    variable_out[i] = variable[i];
  }
}
template <>
__global__ void ApplyAdagradKernel(const size_t size,
    const bool update_slots,
    const half *learning_rate,
    const float *gradient,
    float *variable,
    float *accumulation,
    float *variable_out,
    float *accumulation_out) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    if (update_slots) {
      accumulation[i] += gradient[i] * gradient[i];
      accumulation_out[i] = accumulation[i];
    }
    variable[i] -= __half2float(learning_rate[0]) * gradient[i] / SqrtFunc(accumulation[i]);
    variable_out[i] = variable[i];
  }
}
template <>
__global__ void ApplyAdagradKernel(const size_t size,
    const bool update_slots,
    const float *learning_rate,
    const float *gradient,
    half *variable,
    half *accumulation,
    half *variable_out,
    half *accumulation_out) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    if (update_slots) {
      accumulation[i] += __float2half(gradient[i]) * __float2half(gradient[i]);
      accumulation_out[i] = accumulation[i];
    }
    variable[i] -= __float2half(learning_rate[0]) * __float2half(gradient[i]) / SqrtFunc(accumulation[i]);
    variable_out[i] = variable[i];
  }
}
template <typename T, typename S, typename G>
void ApplyAdagrad(const size_t size,
    const bool update_slots,
    const T *learning_rate,
    const T *gradient,
    const S *learning_rate,
    const G *gradient,
    T *variable,
    T *accumulation,
    T *variable_out,
    T *accumulation_out,
    cudaStream_t cuda_stream) {
  ApplyAdagradKernel<<< GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
      size, update_slots, learning_rate, gradient, variable, accumulation);
      size, update_slots, learning_rate, gradient, variable, accumulation, variable_out, accumulation_out);
}
template void ApplyAdagrad<float>(const size_t size,
template void ApplyAdagrad<float, float, float>(const size_t size,
    const bool update_slots,
    const float *learning_rate,
    const float *gradient,
    float *variable,
    float *accumulation,
    float *variable_out,
    float *accumulation_out,
    cudaStream_t cuda_stream);
template void ApplyAdagrad<half>(const size_t size,
template void ApplyAdagrad<half, half, half>(const size_t size,
    const bool update_slots,
    const half *learning_rate,
    const half *gradient,
    half *variable,
    half *accumulation,
    half *variable_out,
    half *accumulation_out,
    cudaStream_t cuda_stream);
template void ApplyAdagrad<half, float, half>(const size_t size,
    const bool update_slots,
    const float *learning_rate,
    const half *gradient,
    half *variable,
    half *accumulation,
    half *variable_out,
    half *accumulation_out,
    cudaStream_t cuda_stream);
template void ApplyAdagrad<float, float, half>(const size_t size,
    const bool update_slots,
    const float *learning_rate,
    const half *gradient,
    float *variable,
    float *accumulation,
    float *variable_out,
    float *accumulation_out,
    cudaStream_t cuda_stream);
template void ApplyAdagrad<float, half, float>(const size_t size,
    const bool update_slots,
    const half *learning_rate,
    const float *gradient,
    float *variable,
    float *accumulation,
    float *variable_out,
    float *accumulation_out,
    cudaStream_t cuda_stream);
template void ApplyAdagrad<half, float, float>(const size_t size,
    const bool update_slots,
    const float *learning_rate,
    const float *gradient,
    half *variable,
    half *accumulation,
    half *variable_out,
    half *accumulation_out,
    cudaStream_t cuda_stream);
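All of the kernel specializations above perform the same Adagrad step; they differ only in where __half2float/__float2half conversions are inserted. For reference, a minimal single-threaded CPU sketch of that step (illustrative only, not part of this patch; the function name is hypothetical):

// Host-side reference of the update the kernels above perform (sketch only).
#include <cmath>
#include <cstddef>

inline void ApplyAdagradReference(std::size_t size, bool update_slots, float learning_rate,
                                  const float *gradient, float *variable, float *accumulation) {
  for (std::size_t i = 0; i < size; ++i) {
    if (update_slots) {
      accumulation[i] += gradient[i] * gradient[i];  // accumulate squared gradients
    }
    variable[i] -= learning_rate * gradient[i] / std::sqrt(accumulation[i]);  // Adagrad-scaled step
  }
}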
@@ -18,13 +18,15 @@
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_
#include "runtime/device/gpu/cuda_common.h"
template <typename T>
template <typename T, typename S, typename G>
void ApplyAdagrad(const size_t size,
    const bool update_slots,
    const T *learning_rate,
    const T *gradient,
    const S *learning_rate,
    const G *gradient,
    T *variable,
    T *accumulation,
    T *variable_out,
    T *accumulation_out,
    cudaStream_t stream);
#endif  // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMP_ADAGRAD_IMPL_H_
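For context, a hypothetical call site for the new three-parameter launcher declared above, using the fp16-variable / fp32-learning-rate / fp16-gradient combination that the .cu file explicitly instantiates (the wrapper name is made up; the sketch assumes this header is included and that all pointers are valid device memory):

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Template arguments are deduced as ApplyAdagrad<half, float, half>.
void LaunchMixedPrecisionAdagrad(size_t size, half *variable, half *accumulation,
                                 const float *learning_rate, const half *gradient,
                                 half *variable_out, half *accumulation_out, cudaStream_t stream) {
  ApplyAdagrad(size, /*update_slots=*/true, learning_rate, gradient, variable, accumulation,
               variable_out, accumulation_out, stream);
}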
@@ -18,23 +18,59 @@
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(ApplyAdagrad,
    KernelAttr()
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat32)
        .AddOutputAttr(kNumberTypeFloat32)
        .AddOutputAttr(kNumberTypeFloat32),
    AdagradGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(ApplyAdagrad,
    KernelAttr()
        .AddInputAttr(kNumberTypeFloat16)
        .AddInputAttr(kNumberTypeFloat16)
        .AddInputAttr(kNumberTypeFloat16)
        .AddInputAttr(kNumberTypeFloat16)
        .AddOutputAttr(kNumberTypeFloat16)
        .AddOutputAttr(kNumberTypeFloat16),
    AdagradGpuKernel, half)
MS_REG_GPU_KERNEL_THREE(ApplyAdagrad,
    KernelAttr()
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat32)
        .AddOutputAttr(kNumberTypeFloat32)
        .AddOutputAttr(kNumberTypeFloat32),
    AdagradGpuKernel, float, float, float)
MS_REG_GPU_KERNEL_THREE(ApplyAdagrad,
    KernelAttr()
        .AddInputAttr(kNumberTypeFloat16)
        .AddInputAttr(kNumberTypeFloat16)
        .AddInputAttr(kNumberTypeFloat16)
        .AddInputAttr(kNumberTypeFloat16)
        .AddOutputAttr(kNumberTypeFloat16)
        .AddOutputAttr(kNumberTypeFloat16),
    AdagradGpuKernel, half, half, half)
MS_REG_GPU_KERNEL_THREE(ApplyAdagrad,
    KernelAttr()
        .AddInputAttr(kNumberTypeFloat16)
        .AddInputAttr(kNumberTypeFloat16)
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat16)
        .AddOutputAttr(kNumberTypeFloat16)
        .AddOutputAttr(kNumberTypeFloat16),
    AdagradGpuKernel, half, float, half)
MS_REG_GPU_KERNEL_THREE(ApplyAdagrad,
    KernelAttr()
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat16)
        .AddOutputAttr(kNumberTypeFloat32)
        .AddOutputAttr(kNumberTypeFloat32),
    AdagradGpuKernel, float, float, half)
MS_REG_GPU_KERNEL_THREE(ApplyAdagrad,
    KernelAttr()
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat16)
        .AddInputAttr(kNumberTypeFloat32)
        .AddOutputAttr(kNumberTypeFloat32)
        .AddOutputAttr(kNumberTypeFloat32),
    AdagradGpuKernel, float, half, float)
MS_REG_GPU_KERNEL_THREE(ApplyAdagrad,
    KernelAttr()
        .AddInputAttr(kNumberTypeFloat16)
        .AddInputAttr(kNumberTypeFloat16)
        .AddInputAttr(kNumberTypeFloat32)
        .AddInputAttr(kNumberTypeFloat32)
        .AddOutputAttr(kNumberTypeFloat16)
        .AddOutputAttr(kNumberTypeFloat16),
    AdagradGpuKernel, half, float, float)
}  // namespace kernel
}  // namespace mindspore
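Each MS_REG_GPU_KERNEL_THREE entry above binds the dtypes of the four inputs (variable, accumulation, learning_rate, gradient, in that order) to the template triple <T, S, G> of AdagradGpuKernel: T comes from the variable/accumulation inputs and both outputs, S from the learning rate, and G from the gradient, matching the ApplyAdagrad instantiations added in the .cu file.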
@@ -24,7 +24,7 @@
namespace mindspore {
namespace kernel {
template <typename T>
template <typename T, typename S, typename G>
class AdagradGpuKernel : public GpuKernel {
 public:
  AdagradGpuKernel()
@@ -36,6 +36,19 @@ class AdagradGpuKernel : public GpuKernel {
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> & /*workspace*/,
      const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    T *variable = GetDeviceAddress<T>(inputs, 0);
    T *accumulation = GetDeviceAddress<T>(inputs, 1);
    S *learning_rate = GetDeviceAddress<S>(inputs, 2);
    G *gradient = GetDeviceAddress<G>(inputs, 3);
    T *variable_out = GetDeviceAddress<T>(outputs, 0);
    T *accumulation_out = GetDeviceAddress<T>(outputs, 1);
    ApplyAdagrad(inputs[0]->size / sizeof(T), update_slots, learning_rate, gradient, variable, accumulation,
        variable_out, accumulation_out, reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }
  bool Init(const CNodePtr &kernel_node) override {
    size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
    update_slots = AnfAlgo::GetNodeAttr<bool>(kernel_node, "update_slots");
@@ -45,47 +58,35 @@ class AdagradGpuKernel : public GpuKernel {
    }
    variable_size_ = sizeof(T);
    accumulation_size_ = sizeof(T);
    learning_rate_size_ = sizeof(T);
    gradient_size_ = sizeof(T);
    learning_rate_size_ = sizeof(S);
    gradient_size_ = sizeof(G);
    auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
    auto variable_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2);
    for (size_t i = 0; i < variable_shape.size(); i++) {
      variable_size_ *= variable_shape[i];
    }
    auto accumulation_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    auto accumulation_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    for (size_t i = 0; i < accumulation_shape.size(); i++) {
      accumulation_size_ *= accumulation_shape[i];
    }
    auto gradient_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 3);
    auto gradient_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
    for (size_t i = 0; i < gradient_shape.size(); i++) {
      gradient_size_ *= gradient_shape[i];
    }
    InitSizeLists();
    return true;
  }
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
      void *stream_ptr) override {
    T *variable = GetDeviceAddress<T>(inputs, 0);
    T *accumulation = GetDeviceAddress<T>(inputs, 1);
    T *learning_rate = GetDeviceAddress<T>(inputs, 2);
    T *gradient = GetDeviceAddress<T>(inputs, 3);
    ApplyAdagrad(inputs[0]->size / sizeof(T), update_slots, learning_rate, gradient, variable, accumulation,
        reinterpret_cast<cudaStream_t>(stream_ptr));
    return true;
  }
 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(variable_size_);
    input_size_list_.push_back(accumulation_size_);
    input_size_list_.push_back(learning_rate_size_);
    input_size_list_.push_back(gradient_size_);
    output_size_list_.push_back(0);
    output_size_list_.push_back(0);
    output_size_list_.push_back(variable_size_);
    output_size_list_.push_back(accumulation_size_);
  }
 private: