From 6a46ef35a486b6699b096d5798d33e7c9c0ec76c Mon Sep 17 00:00:00 2001
From: ZPaC
Date: Sat, 19 Sep 2020 19:16:45 +0800
Subject: [PATCH] GPU momentum supports use_nesterov

---
 .../gpu/cuda_impl/momentum_impl.cu  | 66 +++++++++++++------
 .../gpu/cuda_impl/momentum_impl.cuh |  2 +-
 .../gpu/nn/momentum_gpu_kernel.h    | 11 +++-
 3 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cu
index a91a9138b6..1e94fd57d5 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cu
@@ -17,36 +17,60 @@
 #include "momentum_impl.cuh"
 template <typename T, typename S, typename G>
 __global__ void MomentumUpdateVariableKernel(const size_t size, T *variable, T *accumulation, const S *learning_rate,
-                                             const G *gradient, const S *momentum) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
-    accumulation[i] = momentum[0] * accumulation[i] + gradient[i];
-    variable[i] -= learning_rate[0] * accumulation[i];
+                                             const G *gradient, const S *momentum, bool use_nesterov) {
+  if (use_nesterov) {
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
+      accumulation[i] = momentum[0] * accumulation[i] + gradient[i];
+      variable[i] -= gradient[i] * learning_rate[0] + accumulation[i] * momentum[0] * learning_rate[0];
+    }
+  } else {
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
+      accumulation[i] = momentum[0] * accumulation[i] + gradient[i];
+      variable[i] -= learning_rate[0] * accumulation[i];
+    }
   }
   return;
 }
 template <>
 __global__ void MomentumUpdateVariableKernel(const size_t size, half *variable, half *accumulation,
-                                             const float *learning_rate, const half *gradient, const float *momentum) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
-    accumulation[i] = __float2half(momentum[0]) * accumulation[i] + gradient[i];
-    variable[i] -= __float2half(learning_rate[0]) * accumulation[i];
+                                             const float *learning_rate, const half *gradient, const float *momentum,
+                                             bool use_nesterov) {
+  if (use_nesterov) {
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
+      accumulation[i] = __float2half(momentum[0]) * accumulation[i] + gradient[i];
+      variable[i] -= gradient[i] * __float2half(learning_rate[0]) +
+                     accumulation[i] * __float2half(momentum[0]) * __float2half(learning_rate[0]);
+    }
+  } else {
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
+      accumulation[i] = __float2half(momentum[0]) * accumulation[i] + gradient[i];
+      variable[i] -= __float2half(learning_rate[0]) * accumulation[i];
+    }
   }
   return;
 }
 template <>
 __global__ void MomentumUpdateVariableKernel(const size_t size, float *variable, float *accumulation,
-                                             const float *learning_rate, const half *gradient, const float *momentum) {
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
-    accumulation[i] = momentum[0] * accumulation[i] + __half2float(gradient[i]);
-    variable[i] -= learning_rate[0] * accumulation[i];
+                                             const float *learning_rate, const half *gradient, const float *momentum,
+                                             bool use_nesterov) {
+  if (use_nesterov) {
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
+      accumulation[i] = momentum[0] * accumulation[i] + __half2float(gradient[i]);
+      variable[i] -= __half2float(gradient[i]) * learning_rate[0] + accumulation[i] * momentum[0] * learning_rate[0];
+    }
+  } else {
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (size); i += blockDim.x * gridDim.x) {
+      accumulation[i] = momentum[0] * accumulation[i] + __half2float(gradient[i]);
+      variable[i] -= learning_rate[0] * accumulation[i];
+    }
   }
   return;
 }
 template <typename T, typename S, typename G>
 void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient,
-                            const S *momentum, cudaStream_t cuda_stream) {
-  MomentumUpdateVariableKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, variable, accumulation,
-                                                                                  learning_rate, gradient, momentum);
+                            const S *momentum, bool use_nesterov, cudaStream_t cuda_stream) {
+  MomentumUpdateVariableKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
+    size, variable, accumulation, learning_rate, gradient, momentum, use_nesterov);
   return;
 }
 
@@ -91,16 +115,20 @@ void FusedScaleMomentum(const size_t element_num, T *scale, T *variable, T *accu
 
 template void MomentumUpdateVariable<float, float, float>(const size_t size, float *variable, float *accumulation,
                                                           const float *learning_rate, const float *gradient,
-                                                          const float *momentum, cudaStream_t cuda_stream);
+                                                          const float *momentum, bool use_nesterov,
+                                                          cudaStream_t cuda_stream);
 template void MomentumUpdateVariable<half, half, half>(const size_t size, half *variable, half *accumulation,
                                                        const half *learning_rate, const half *gradient,
-                                                       const half *momentum, cudaStream_t cuda_stream);
+                                                       const half *momentum, bool use_nesterov,
+                                                       cudaStream_t cuda_stream);
 template void MomentumUpdateVariable<half, float, half>(const size_t size, half *variable, half *accumulation,
                                                         const float *learning_rate, const half *gradient,
-                                                        const float *momentum, cudaStream_t cuda_stream);
+                                                        const float *momentum, bool use_nesterov,
+                                                        cudaStream_t cuda_stream);
 template void MomentumUpdateVariable<float, float, half>(const size_t size, float *variable, float *accumulation,
                                                           const float *learning_rate, const half *gradient,
-                                                          const float *momentum, cudaStream_t cuda_stream);
+                                                          const float *momentum, bool use_nesterov,
+                                                          cudaStream_t cuda_stream);
 
 template void FusedWeightDecayScaleMomentum(const size_t element_num, float *weight_decay, float *scale,
                                             float *variable, float *accumulation, const float *learning_rate,
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh
index 00fa7afb2a..7ce15c97ed 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/momentum_impl.cuh
@@ -20,7 +20,7 @@
 #include "runtime/device/gpu/cuda_common.h"
 template <typename T, typename S, typename G>
 void MomentumUpdateVariable(const size_t size, T *variable, T *accumulation, const S *learning_rate, const G *gradient,
-                            const S *momentum, cudaStream_t cuda_stream);
+                            const S *momentum, bool use_nesterov, cudaStream_t cuda_stream);
 template <typename T, typename S>
 void FusedWeightDecayScaleMomentum(const size_t element_num, T *weight_decay, T *scale, T *variable, T *accumulation,
                                    const T *learning_rate, const S *gradient, const T *momentum,
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/momentum_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/momentum_gpu_kernel.h
index 091e2d6e9f..2386105d0c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/momentum_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/nn/momentum_gpu_kernel.h
@@ -27,7 +27,12 @@ template <typename T, typename S, typename G>
 class MomentumGpuKernel : public GpuKernel {
  public:
   MomentumGpuKernel()
-      : variable_size_(0), accumulation_size_(0), learning_rate_size_(0), gradient_size_(0), momentum_size_(0) {}
+      : use_nesterov_(false),
+        variable_size_(0),
+        accumulation_size_(0),
+        learning_rate_size_(0),
+        gradient_size_(0),
+        momentum_size_(0) {}
   ~MomentumGpuKernel() override = default;
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
   const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
@@ -41,7 +46,7 @@
     G *gradient = GetDeviceAddress<G>(inputs, 3);
     S *momentum = GetDeviceAddress<S>(inputs, 4);
     MomentumUpdateVariable(inputs[0]->size / sizeof(T), variable, accumulation, learning_rate, gradient, momentum,
-                           reinterpret_cast<cudaStream_t>(stream_ptr));
+                           use_nesterov_, reinterpret_cast<cudaStream_t>(stream_ptr));
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
@@ -50,6 +55,7 @@
       MS_LOG(ERROR) << "Input number is " << input_num << ", but momentum needs 5 inputs.";
       return false;
     }
+    use_nesterov_ = GetAttr<bool>(kernel_node, "use_nesterov");
 
     variable_size_ = sizeof(T);
     accumulation_size_ = sizeof(T);
@@ -84,6 +90,7 @@
   }
 
  private:
+  bool use_nesterov_;
   size_t variable_size_;
   size_t accumulation_size_;
   size_t learning_rate_size_;
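
Note (not part of the patch): to check the arithmetic of the two branches outside MindSpore, the following is a minimal, self-contained CUDA sketch. The kernel name, launch configuration, and test values are illustrative assumptions; only the update rules mirror the patch. Both branches share the accumulation update accumulation = momentum * accumulation + gradient; the standard branch then applies variable -= lr * accumulation, while the Nesterov branch applies the look-ahead step variable -= gradient * lr + accumulation * momentum * lr.

#include <cstdio>
#include <cuda_runtime.h>

// Grid-stride sketch of the patched kernel's two branches (float-only).
__global__ void MomentumSketch(size_t size, float *var, float *acc, float lr,
                               const float *grad, float m, bool use_nesterov) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
    acc[i] = m * acc[i] + grad[i];  // shared accumulation update
    if (use_nesterov) {
      var[i] -= grad[i] * lr + acc[i] * m * lr;  // Nesterov look-ahead
    } else {
      var[i] -= lr * acc[i];  // standard momentum
    }
  }
}

int main() {
  const size_t n = 4;
  float h_var[n] = {1.f, 1.f, 1.f, 1.f};
  float h_acc[n] = {0.f};  // accumulation starts at zero
  float h_grad[n] = {0.5f, 0.5f, 0.5f, 0.5f};
  float *d_var, *d_acc, *d_grad;
  cudaMalloc(&d_var, n * sizeof(float));
  cudaMalloc(&d_acc, n * sizeof(float));
  cudaMalloc(&d_grad, n * sizeof(float));
  cudaMemcpy(d_var, h_var, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_acc, h_acc, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_grad, h_grad, n * sizeof(float), cudaMemcpyHostToDevice);
  // One step with lr = 0.1, momentum = 0.9, Nesterov enabled.
  MomentumSketch<<<1, 128>>>(n, d_var, d_acc, 0.1f, d_grad, 0.9f, true);
  cudaMemcpy(h_var, d_var, n * sizeof(float), cudaMemcpyDeviceToHost);
  // acc = 0.5, so var = 1 - (0.5*0.1 + 0.5*0.9*0.1) = 0.905
  printf("var[0] = %f\n", h_var[0]);
  cudaFree(d_var);
  cudaFree(d_acc);
  cudaFree(d_grad);
  return 0;
}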