|
|
@@ -1,5 +1,5 @@ |
|
|
/** |
|
|
/** |
|
|
* Copyright 2020 Huawei Technologies Co., Ltd |
|
|
|
|
|
|
|
|
* Copyright 2020-2021 Huawei Technologies Co., Ltd |
|
|
* |
|
|
* |
|
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
|
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
|
|
* you may not use this file except in compliance with the License. |
|
|
* you may not use this file except in compliance with the License. |
|
|
@@ -26,6 +26,7 @@ __global__ void SqrtGradKernel(const T *input, const T *dout, T *output, const s |
|
|
} |
|
|
} |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
template <typename T> |
|
|
template <typename T> |
|
|
__global__ void RsqrtGradKernel(const T *input, const T *dout, T *output, const size_t count) { |
|
|
__global__ void RsqrtGradKernel(const T *input, const T *dout, T *output, const size_t count) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
@@ -37,6 +38,7 @@ __global__ void RsqrtGradKernel(const T *input, const T *dout, T *output, const |
|
|
} |
|
|
} |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
template <typename T> |
|
|
template <typename T> |
|
|
__global__ void AsinGradKernel(const T *input, const T *dout, T *output, const size_t count) { |
|
|
__global__ void AsinGradKernel(const T *input, const T *dout, T *output, const size_t count) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
@@ -46,6 +48,7 @@ __global__ void AsinGradKernel(const T *input, const T *dout, T *output, const s |
|
|
} |
|
|
} |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
template <> |
|
|
template <> |
|
|
__global__ void AsinGradKernel(const half *input, const half *dout, half *output, const size_t count) { |
|
|
__global__ void AsinGradKernel(const half *input, const half *dout, half *output, const size_t count) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
@@ -55,6 +58,7 @@ __global__ void AsinGradKernel(const half *input, const half *dout, half *output |
|
|
} |
|
|
} |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
template <typename T> |
|
|
template <typename T> |
|
|
__global__ void ACosGradKernel(const T *input, const T *dout, T *output, const size_t count) { |
|
|
__global__ void ACosGradKernel(const T *input, const T *dout, T *output, const size_t count) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
@@ -65,6 +69,7 @@ __global__ void ACosGradKernel(const T *input, const T *dout, T *output, const s |
|
|
} |
|
|
} |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
template <> |
|
|
template <> |
|
|
__global__ void ACosGradKernel(const half *input, const half *dout, half *output, const size_t count) { |
|
|
__global__ void ACosGradKernel(const half *input, const half *dout, half *output, const size_t count) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
@@ -75,6 +80,7 @@ __global__ void ACosGradKernel(const half *input, const half *dout, half *output |
|
|
} |
|
|
} |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
template <typename T> |
|
|
template <typename T> |
|
|
__global__ void AtanGradKernel(const T *input, const T *dout, T *output, const size_t count) { |
|
|
__global__ void AtanGradKernel(const T *input, const T *dout, T *output, const size_t count) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
@@ -84,6 +90,7 @@ __global__ void AtanGradKernel(const T *input, const T *dout, T *output, const s |
|
|
} |
|
|
} |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
template <typename T> |
|
|
template <typename T> |
|
|
__global__ void AsinhGradKernel(const T *input, const T *dout, T *output, const size_t count) { |
|
|
__global__ void AsinhGradKernel(const T *input, const T *dout, T *output, const size_t count) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
@@ -93,6 +100,7 @@ __global__ void AsinhGradKernel(const T *input, const T *dout, T *output, const |
|
|
} |
|
|
} |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
template <typename T> |
|
|
template <typename T> |
|
|
__global__ void AcoshGradKernel(const T *input, const T *dout, T *output, const size_t count) { |
|
|
__global__ void AcoshGradKernel(const T *input, const T *dout, T *output, const size_t count) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { |
|
|
@@ -102,11 +110,24 @@ __global__ void AcoshGradKernel(const T *input, const T *dout, T *output, const |
|
|
} |
|
|
} |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/**
 * Element-wise backward kernel for the reciprocal op.
 *
 * For every i in [0, count): output[i] = -dout[i] * input[i] * input[i]. The arithmetic is carried out in
 * float and the result is cast back to T.
 * NOTE(review): this matches d(1/x)/dx = -(1/x)^2 only if `input` holds the forward result y = 1/x —
 * confirm against the caller.
 *
 * The loop advances by the total number of launched threads, so any 1-D launch shape visits all `count`
 * elements.
 */
template <typename T>
__global__ void ReciprocalGradKernel(const T *input, const T *dout, T *output, const size_t count) {
  // blockIdx.x, blockDim.x and gridDim.x are 32-bit unsigned; widen before multiplying so the start index
  // and the stride cannot wrap around for very large launches.
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < count; i += stride) {
    const float inputf = static_cast<float>(input[i]);
    const float doutf = static_cast<float>(dout[i]);
    const float res = -1.0f * doutf * inputf * inputf;
    output[i] = static_cast<T>(res);
  }
  return;
}
|
|
|
|
|
|
|
|
/**
 * Host-side launcher for SqrtGradKernel.
 *
 * Enqueues the kernel on `cuda_stream` for `count` elements, using GET_BLOCKS(count) blocks of GET_THREADS
 * threads and no dynamic shared memory. The launch is asynchronous: this function does not synchronize the
 * stream and does not query the launch status.
 */
template <typename T>
void SqrtGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream) {
  // Nothing to compute for an empty tensor. NOTE(review): GET_BLOCKS is defined elsewhere; if it yields 0
  // for count == 0 the launch would be rejected as an invalid configuration, so skip it.
  if (count == 0) {
    return;
  }
  SqrtGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count);
}
|
|
|
|
|
|
|
|
template <typename T> |
|
|
template <typename T> |
|
|
void RsqrtGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream) { |
|
|
void RsqrtGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream) { |
|
|
RsqrtGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); |
|
|
RsqrtGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count); |
|
|
@@ -143,20 +164,28 @@ void AcoshGrad(const T *input, const T *dout, T *output, const size_t count, cud |
|
|
return; |
|
|
return; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/**
 * Host-side launcher for ReciprocalGradKernel.
 *
 * Enqueues the kernel on `cuda_stream` for `count` elements, using GET_BLOCKS(count) blocks of GET_THREADS
 * threads and no dynamic shared memory. The launch is asynchronous: this function does not synchronize the
 * stream and does not query the launch status.
 */
template <typename T>
void ReciprocalGrad(const T *input, const T *dout, T *output, const size_t count, cudaStream_t cuda_stream) {
  // Nothing to compute for an empty tensor. NOTE(review): GET_BLOCKS is defined elsewhere; if it yields 0
  // for count == 0 the launch would be rejected as an invalid configuration, so skip it.
  if (count == 0) {
    return;
  }
  ReciprocalGradKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, dout, output, count);
}
|
|
|
|
|
|
|
|
// Explicit instantiations of the host launchers for float. Each specialization must be instantiated
// exactly once in this translation unit.
template void SqrtGrad<float>(const float *input, const float *dout, float *output, const size_t count,
                              cudaStream_t cuda_stream);
template void RsqrtGrad<float>(const float *input, const float *dout, float *output, const size_t count,
                               cudaStream_t cuda_stream);
template void AsinGrad<float>(const float *input, const float *dout, float *output, const size_t count,
                              cudaStream_t cuda_stream);
template void ACosGrad<float>(const float *input, const float *dout, float *output, const size_t count,
                              cudaStream_t cuda_stream);
template void AtanGrad<float>(const float *input, const float *dout, float *output, const size_t count,
                              cudaStream_t cuda_stream);
template void AsinhGrad<float>(const float *input, const float *dout, float *output, const size_t count,
                               cudaStream_t cuda_stream);
template void AcoshGrad<float>(const float *input, const float *dout, float *output, const size_t count,
                               cudaStream_t cuda_stream);
template void ReciprocalGrad<float>(const float *input, const float *dout, float *output, const size_t count,
                                    cudaStream_t cuda_stream);
|
|
// Explicit instantiation of the SqrtGrad host launcher for half (must appear exactly once).
template void SqrtGrad<half>(const half *input, const half *dout, half *output, const size_t count,
                             cudaStream_t cuda_stream);
|
|
template void RsqrtGrad<half>(const half *input, const half *dout, half *output, const size_t count, |
|
|
template void RsqrtGrad<half>(const half *input, const half *dout, half *output, const size_t count, |
|
|
@@ -164,10 +193,12 @@ template void RsqrtGrad<half>(const half *input, const half *dout, half *output, |
|
|
// Explicit instantiations of the host launchers for half. Each specialization must be instantiated
// exactly once in this translation unit.
template void AsinGrad<half>(const half *input, const half *dout, half *output, const size_t count,
                             cudaStream_t cuda_stream);
template void ACosGrad<half>(const half *input, const half *dout, half *output, const size_t count,
                             cudaStream_t cuda_stream);
template void AtanGrad<half>(const half *input, const half *dout, half *output, const size_t count,
                             cudaStream_t cuda_stream);
template void AsinhGrad<half>(const half *input, const half *dout, half *output, const size_t count,
                              cudaStream_t cuda_stream);
template void AcoshGrad<half>(const half *input, const half *dout, half *output, const size_t count,
                              cudaStream_t cuda_stream);
template void ReciprocalGrad<half>(const half *input, const half *dout, half *output, const size_t count,
                                   cudaStream_t cuda_stream);