|
|
|
@@ -103,6 +103,35 @@ __global__ void ZeroslikeKernel(T *output, size_t count) { |
|
|
|
return; |
|
|
|
} |
|
|
|
// Element-wise absolute value: output[i] = abs(input[i]) for every i in [0, count).
//
// Each thread starts at its flat global index along x and then advances by the
// total number of launched threads (blockDim.x * gridDim.x) until it reaches
// `count`, so the full range is covered whatever 1-D grid size is used.
//
// input  - elements to read.
// output - elements to write; output[i] receives the result for input[i].
// count  - number of elements to process.
template <typename T>
__global__ void AbsKernel(T *input, T *output, size_t count) {
  // blockIdx, blockDim and gridDim hold 32-bit unsigned values. Widen one operand
  // before multiplying so neither the start index nor the stride can wrap around
  // when the product no longer fits in 32 bits.
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < count; i += stride) {
    output[i] = abs(input[i]);
  }
}
|
|
|
// Half-precision specialization of AbsKernel: output[i] = |input[i]| for every
// i in [0, count).
//
// The absolute value is produced by clearing the sign bit of the 16-bit encoding
// instead of comparing against zero. A comparison-based form (x < 0 ? -x : x)
// leaves -0.0 negative because -0.0 < 0 is false; masking the sign bit maps
// -0.0 to +0.0 and keeps every other magnitude unchanged.
//
// input  - elements to read.
// output - elements to write; output[i] receives the result for input[i].
// count  - number of elements to process.
template <>
__global__ void AbsKernel(half *input, half *output, size_t count) {
  // Widen before multiplying: the built-in index variables are 32-bit unsigned
  // and their product could otherwise wrap.
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < count; i += stride) {
    // Reinterpret the half as its raw bits, drop bit 15 (the sign), and
    // reinterpret the result as a half again.
    const unsigned short magnitude_bits = static_cast<unsigned short>(__half_as_ushort(input[i]) & 0x7fffU);
    output[i] = __ushort_as_half(magnitude_bits);
  }
}
|
|
|
// Element-wise floor: output[i] = floor(input[i]) for every i in [0, count).
//
// Each thread starts at its flat global index along x and then advances by the
// total number of launched threads (blockDim.x * gridDim.x) until it reaches
// `count`, so the full range is covered whatever 1-D grid size is used.
//
// input  - elements to read.
// output - elements to write; output[i] receives the result for input[i].
// count  - number of elements to process.
template <typename T>
__global__ void FloorKernel(T *input, T *output, size_t count) {
  // blockIdx, blockDim and gridDim hold 32-bit unsigned values. Widen one operand
  // before multiplying so neither the start index nor the stride can wrap around
  // when the product no longer fits in 32 bits.
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < count; i += stride) {
    output[i] = floor(input[i]);
  }
}
|
|
|
// Half-precision specialization of FloorKernel: output[i] = hfloor(input[i]) for
// every i in [0, count).
//
// NOTE: hfloor is a half-precision intrinsic from cuda_fp16.h; the CUDA Math API
// documents it as requiring compute capability 5.3 or newer.
//
// input  - elements to read.
// output - elements to write; output[i] receives the result for input[i].
// count  - number of elements to process.
template <>
__global__ void FloorKernel(half *input, half *output, size_t count) {
  // Widen before multiplying: the built-in index variables are 32-bit unsigned
  // and their product could otherwise wrap.
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < count; i += stride) {
    output[i] = hfloor(input[i]);
  }
}
|
|
|
template <typename T> |
|
|
|
void Exponential(T *input, T *output, size_t count, cudaStream_t cuda_stream) { |
|
|
|
ExponentialKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(input, output, count); |
|
|
|
return; |
|
|
|
@@ -147,6 +176,16 @@ void Zeroslike(T *output, size_t count, cudaStream_t cuda_stream) { |
|
|
|
ZeroslikeKernel<<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(output, count); |
|
|
|
return; |
|
|
|
} |
|
|
|
// Host-side launcher for AbsKernel.
//
// Enqueues AbsKernel on `cuda_stream` with GET_BLOCKS(count) blocks of
// GET_THREADS threads and no dynamic shared memory. The function returns right
// after the launch statement; it performs no synchronization or error check.
//
// input       - passed through to the kernel as the source pointer.
// output      - passed through to the kernel as the destination pointer.
// count       - number of elements; also used to size the grid.
// cuda_stream - stream the kernel is launched on.
template <typename T>
void Abs(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
  AbsKernel<T><<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(
    input, output, count);
}
|
|
|
// Host-side launcher for FloorKernel.
//
// Enqueues FloorKernel on `cuda_stream` with GET_BLOCKS(count) blocks of
// GET_THREADS threads and no dynamic shared memory. The function returns right
// after the launch statement; it performs no synchronization or error check.
//
// input       - passed through to the kernel as the source pointer.
// output      - passed through to the kernel as the destination pointer.
// count       - number of elements; also used to size the grid.
// cuda_stream - stream the kernel is launched on.
template <typename T>
void Floor(T *input, T *output, size_t count, cudaStream_t cuda_stream) {
  FloorKernel<T><<<GET_BLOCKS(count), GET_THREADS, 0, cuda_stream>>>(
    input, output, count);
}
|
|
|
|
|
|
|
// Explicit float instantiations of the host launchers, so their definitions are
// emitted in this translation unit.
template void Exponential<float>(float *, float *, size_t, cudaStream_t);
template void Logarithm<float>(float *, float *, size_t, cudaStream_t);
|
|
|
@@ -156,6 +195,8 @@ template void Square<float>(float *input, float *output, size_t count, cudaStrea |
|
|
|
// Explicit float instantiations of the host launchers.
template void Sqrt<float>(float *, float *, size_t, cudaStream_t);
template void Rsqrt<float>(float *, float *, size_t, cudaStream_t);
template void Zeroslike<float>(float *, size_t, cudaStream_t);
template void Abs<float>(float *, float *, size_t, cudaStream_t);
template void Floor<float>(float *, float *, size_t, cudaStream_t);

// Explicit half instantiations of the host launchers.
template void Exponential<half>(half *, half *, size_t, cudaStream_t);
template void Logarithm<half>(half *, half *, size_t, cudaStream_t);
template void Negative<half>(half *, half *, size_t, cudaStream_t);
|
|
|
@@ -164,3 +205,5 @@ template void Square<half>(half *input, half *output, size_t count, cudaStream_t |
|
|
|
// Explicit half instantiations of the host launchers.
template void Sqrt<half>(half *, half *, size_t, cudaStream_t);
template void Rsqrt<half>(half *, half *, size_t, cudaStream_t);
template void Zeroslike<half>(half *, size_t, cudaStream_t);
template void Abs<half>(half *, half *, size_t, cudaStream_t);
template void Floor<half>(half *, half *, size_t, cudaStream_t);