@@ -32,16 +32,12 @@ __global__ void ApplyAdagradKernel(const size_t size,
                                    const S *learning_rate,
                                    const G *gradient,
                                    T *variable,
-                                   T *accumulation,
-                                   T *variable_out,
-                                   T *accumulation_out) {
+                                   T *accumulation) {
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
     if (update_slots) {
       accumulation[i] += gradient[i] * gradient[i];
-      accumulation_out[i] = accumulation[i];
     }
     variable[i] -= learning_rate[0] * gradient[i] / SqrtFunc(accumulation[i]);
-    variable_out[i] = variable[i];
   }
 }
 
@@ -51,16 +47,12 @@ __global__ void ApplyAdagradKernel(const size_t size,
                                    const float *learning_rate,
                                    const half *gradient,
                                    half *variable,
-                                   half *accumulation,
-                                   half *variable_out,
-                                   half *accumulation_out) {
+                                   half *accumulation) {
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
     if (update_slots) {
       accumulation[i] += gradient[i] * gradient[i];
-      accumulation_out[i] = accumulation[i];
     }
     variable[i] -= __float2half(learning_rate[0]) * gradient[i] / SqrtFunc(accumulation[i]);
-    variable_out[i] = variable[i];
   }
 }
 
@@ -70,16 +62,12 @@ __global__ void ApplyAdagradKernel(const size_t size,
                                    const float *learning_rate,
                                    const half *gradient,
                                    float *variable,
-                                   float *accumulation,
-                                   float *variable_out,
-                                   float *accumulation_out) {
+                                   float *accumulation) {
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
     if (update_slots) {
       accumulation[i] += __half2float(gradient[i]) * __half2float(gradient[i]);
-      accumulation_out[i] = accumulation[i];
     }
     variable[i] -= learning_rate[0] * __half2float(gradient[i]) / SqrtFunc(accumulation[i]);
-    variable_out[i] = variable[i];
   }
 }
 
@@ -89,16 +77,12 @@ __global__ void ApplyAdagradKernel(const size_t size,
                                    const half *learning_rate,
                                    const float *gradient,
                                    float *variable,
-                                   float *accumulation,
-                                   float *variable_out,
-                                   float *accumulation_out) {
+                                   float *accumulation) {
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
     if (update_slots) {
       accumulation[i] += gradient[i] * gradient[i];
-      accumulation_out[i] = accumulation[i];
     }
     variable[i] -= __half2float(learning_rate[0]) * gradient[i] / SqrtFunc(accumulation[i]);
-    variable_out[i] = variable[i];
   }
 }
 
@@ -108,16 +92,12 @@ __global__ void ApplyAdagradKernel(const size_t size,
                                    const float *learning_rate,
                                    const float *gradient,
                                    half *variable,
-                                   half *accumulation,
-                                   half *variable_out,
-                                   half *accumulation_out) {
+                                   half *accumulation) {
   for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
     if (update_slots) {
       accumulation[i] += __float2half(gradient[i]) * __float2half(gradient[i]);
-      accumulation_out[i] = accumulation[i];
     }
     variable[i] -= __float2half(learning_rate[0]) * __float2half(gradient[i]) / SqrtFunc(accumulation[i]);
-    variable_out[i] = variable[i];
   }
 }
 
@@ -128,11 +108,9 @@ void ApplyAdagrad(const size_t size,
                   const G *gradient,
                   T *variable,
                   T *accumulation,
-                  T *variable_out,
-                  T *accumulation_out,
                   cudaStream_t cuda_stream) {
   ApplyAdagradKernel<<< GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
-    size, update_slots, learning_rate, gradient, variable, accumulation, variable_out, accumulation_out);
+    size, update_slots, learning_rate, gradient, variable, accumulation);
 }
 
 template void ApplyAdagrad<float, float, float>(const size_t size,
@@ -141,8 +119,6 @@ template void ApplyAdagrad<float, float, float>(const size_t size,
                                                 const float *gradient,
                                                 float *variable,
                                                 float *accumulation,
-                                                float *variable_out,
-                                                float *accumulation_out,
                                                 cudaStream_t cuda_stream);
 
 template void ApplyAdagrad<half, half, half>(const size_t size,
@@ -151,8 +127,6 @@ template void ApplyAdagrad<half, half, half>(const size_t size,
                                              const half *gradient,
                                              half *variable,
                                              half *accumulation,
-                                             half *variable_out,
-                                             half *accumulation_out,
                                              cudaStream_t cuda_stream);
 
 template void ApplyAdagrad<half, float, half>(const size_t size,
@@ -161,8 +135,6 @@ template void ApplyAdagrad<half, float, half>(const size_t size,
                                               const half *gradient,
                                               half *variable,
                                               half *accumulation,
-                                              half *variable_out,
-                                              half *accumulation_out,
                                               cudaStream_t cuda_stream);
 
 template void ApplyAdagrad<float, float, half>(const size_t size,
@@ -171,8 +143,6 @@ template void ApplyAdagrad<float, float, half>(const size_t size,
                                                const half *gradient,
                                                float *variable,
                                                float *accumulation,
-                                               float *variable_out,
-                                               float *accumulation_out,
                                                cudaStream_t cuda_stream);
 
 template void ApplyAdagrad<float, half, float>(const size_t size,
@@ -181,8 +151,6 @@ template void ApplyAdagrad<float, half, float>(const size_t size,
                                                const float *gradient,
                                                float *variable,
                                                float *accumulation,
-                                               float *variable_out,
-                                               float *accumulation_out,
                                                cudaStream_t cuda_stream);
 
 template void ApplyAdagrad<half, float, float>(const size_t size,
@@ -191,6 +159,4 @@ template void ApplyAdagrad<half, float, float>(const size_t size,
                                                const float *gradient,
                                                half *variable,
                                                half *accumulation,
-                                               half *variable_out,
-                                               half *accumulation_out,
                                                cudaStream_t cuda_stream);
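
Note (not part of the patch): after this change every kernel writes the Adagrad update back into variable and accumulation in place, so the variable_out/accumulation_out buffers and the copy statements disappear from the signatures, the kernel launch, and the explicit instantiations. A minimal host-side sketch of calling the trimmed entry point follows; the wrapper name RunAdagradStep, the header path, and the device buffer names are illustrative assumptions, not taken from this file.

#include <cuda_runtime.h>
// Assumed to declare the templated ApplyAdagrad host function shown above.
#include "adagrad_impl.cuh"

// One Adagrad step over already-populated device buffers, in place:
//   accumulation[i] += gradient[i] * gradient[i];                              (when update_slots)
//   variable[i]     -= learning_rate[0] * gradient[i] / SqrtFunc(accumulation[i]);
void RunAdagradStep(float *d_variable, float *d_accumulation, const float *d_gradient,
                    const float *d_learning_rate, size_t size, cudaStream_t stream) {
  const bool update_slots = true;  // also refresh the accumulator slot
  ApplyAdagrad(size, update_slots, d_learning_rate, d_gradient, d_variable, d_accumulation, stream);
  // Results land in d_variable and d_accumulation; no separate *_out buffers are passed.
}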