|
|
|
@@ -1,15 +1,12 @@ |
|
|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable |
|
|
|
|
|
|
|
#define SLICES 4 |
|
|
|
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) |
|
|
|
#define MIN(X, Y) (X < Y ? X : Y) |
|
|
|
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; |
|
|
|
|
|
|
|
__kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t output, const int4 img_shape, |
|
|
|
__kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape, |
|
|
|
const float alpha) { |
|
|
|
int Y = get_global_id(0); // H |
|
|
|
int X = get_global_id(1); // W C4 |
|
|
|
if (X >= img_shape.z || Y >= img_shape.y) return; |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= img_shape.x || Y >= img_shape.y) return; |
|
|
|
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); |
|
|
|
FLT4 tmp; |
|
|
|
FLT alpha_f = TO_FLT(alpha); |
|
|
|
@@ -20,53 +17,40 @@ __kernel void LeakyRelu(__read_only image2d_t input, __write_only image2d_t outp |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), tmp); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) { |
|
|
|
int Y = get_global_id(0); |
|
|
|
int X = get_global_id(1); |
|
|
|
if (X >= input_shape.z || Y >= input_shape.y) return; |
|
|
|
__kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= img_shape.x || Y >= img_shape.y) return; |
|
|
|
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); |
|
|
|
FLT4 tmp; |
|
|
|
tmp.x = in_c4.x > 0.0f ? in_c4.x : 0.0f; |
|
|
|
tmp.y = in_c4.y > 0.0f ? in_c4.y : 0.0f; |
|
|
|
tmp.z = in_c4.z > 0.0f ? in_c4.z : 0.0f; |
|
|
|
tmp.w = in_c4.w > 0.0f ? in_c4.w : 0.0f; |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), tmp); |
|
|
|
in_c4 = max(in_c4, (FLT)(0.f)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), in_c4); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) { |
|
|
|
int Y = get_global_id(0); |
|
|
|
int X = get_global_id(1); |
|
|
|
if (X >= input_shape.z || Y >= input_shape.y) return; |
|
|
|
__kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= img_shape.x || Y >= img_shape.y) return; |
|
|
|
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); |
|
|
|
FLT4 tmp; |
|
|
|
tmp.x = in_c4.x > 0.0f ? MIN(in_c4.x, 6.0f) : 0.0f; |
|
|
|
tmp.y = in_c4.y > 0.0f ? MIN(in_c4.y, 6.0f) : 0.0f; |
|
|
|
tmp.z = in_c4.z > 0.0f ? MIN(in_c4.z, 6.0f) : 0.0f; |
|
|
|
tmp.w = in_c4.w > 0.0f ? MIN(in_c4.w, 6.0f) : 0.0f; |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), tmp); |
|
|
|
in_c4 = clamp(in_c4, (FLT)(0.f), (FLT)(6.f)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), in_c4); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape) { |
|
|
|
int Y = get_global_id(0); |
|
|
|
int X = get_global_id(1); |
|
|
|
if (X >= input_shape.z || Y >= input_shape.y) return; |
|
|
|
__kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= img_shape.x || Y >= img_shape.y) return; |
|
|
|
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); |
|
|
|
FLT4 tmp; |
|
|
|
tmp.x = 1.0f / (1.0f + exp(-in_c4.x)); |
|
|
|
tmp.y = 1.0f / (1.0f + exp(-in_c4.y)); |
|
|
|
tmp.z = 1.0f / (1.0f + exp(-in_c4.z)); |
|
|
|
tmp.w = 1.0f / (1.0f + exp(-in_c4.w)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), tmp); |
|
|
|
in_c4 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-in_c4)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), in_c4); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void Tanh(__read_only image2d_t input, __write_only image2d_t output, int4 input_shape) { |
|
|
|
int Y = get_global_id(0); |
|
|
|
int X = get_global_id(1); |
|
|
|
if (X >= input_shape.z || Y >= input_shape.y) return; |
|
|
|
__kernel void Tanh(__read_only image2d_t input, __write_only image2d_t output, const int2 img_shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= img_shape.x || Y >= img_shape.y) return; |
|
|
|
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(X, Y)); |
|
|
|
in_c4.x = (exp(in_c4.x) - exp(-in_c4.x)) / (exp(in_c4.x) + exp(-in_c4.x)); |
|
|
|
in_c4.y = (exp(in_c4.y) - exp(-in_c4.y)) / (exp(in_c4.y) + exp(-in_c4.y)); |
|
|
|
in_c4.z = (exp(in_c4.z) - exp(-in_c4.z)) / (exp(in_c4.z) + exp(-in_c4.z)); |
|
|
|
in_c4.w = (exp(in_c4.w) - exp(-in_c4.w)) / (exp(in_c4.w) + exp(-in_c4.w)); |
|
|
|
FLT4 exp0 = exp(in_c4); |
|
|
|
FLT4 exp1 = exp(-in_c4); |
|
|
|
in_c4 = (exp0 - exp1) / (exp0 + exp1); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), in_c4); |
|
|
|
} |