|
|
|
@@ -3,7 +3,7 @@ |
|
|
|
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; |
|
|
|
|
|
|
|
__kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -12,24 +12,13 @@ __kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_ |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a + b); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementAddReLU_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
return; |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), max(a + b, (FLT4)(0.f))); |
|
|
|
FLT4 result = a + b; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -38,11 +27,13 @@ __kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_ |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a - b); |
|
|
|
FLT4 result = a - b; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -51,11 +42,13 @@ __kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_ |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a * b); |
|
|
|
FLT4 result = a * b; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -64,11 +57,13 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_ |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b)); |
|
|
|
FLT4 result = divide_no_check(a, b); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -77,11 +72,13 @@ __kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_ |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) & AS_UINT4(b))); |
|
|
|
FLT4 result = AS_FLT4(AS_UINT4(a) & AS_UINT4(b)); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -90,11 +87,13 @@ __kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) | AS_UINT4(b))); |
|
|
|
FLT4 result = AS_FLT4(AS_UINT4(a) | AS_UINT4(b)); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -103,11 +102,13 @@ __kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_ |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), max(a, b)); |
|
|
|
FLT4 result = max(a, b); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -116,11 +117,14 @@ __kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_ |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), min(a, b)); |
|
|
|
FLT4 result = min(a, b); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, |
|
|
|
float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -129,11 +133,14 @@ __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only ima |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, b))); |
|
|
|
FLT4 result = floor(divide_no_check(a, b)); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, |
|
|
|
float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -142,11 +149,14 @@ __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only ima |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, b)) * b); |
|
|
|
FLT4 result = floor(divide_no_check(a, b)) * b; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, |
|
|
|
float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -155,11 +165,13 @@ __kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), pown((a - b), (int4)2)); |
|
|
|
FLT4 result = pown((a - b), (int4)2); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -168,11 +180,15 @@ __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2 |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a == b ? (FLT4)1.f : (FLT4).0f); |
|
|
|
FLT4 result = a == b ? (FLT4)1.f : (FLT4).0f; |
|
|
|
// error? |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, |
|
|
|
float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -181,11 +197,13 @@ __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only ima |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a != b ? (FLT4)1.f : (FLT4).0f); |
|
|
|
FLT4 result = a != b ? (FLT4)1.f : (FLT4).0f; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -194,11 +212,14 @@ __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a < b ? (FLT4)1.f : (FLT4).0f); |
|
|
|
FLT4 result = a < b ? (FLT4)1.f : (FLT4).0f; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, |
|
|
|
float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -207,11 +228,13 @@ __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only im |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a <= b ? (FLT4)1.f : (FLT4).0f); |
|
|
|
FLT4 result = a <= b ? (FLT4)1.f : (FLT4).0f; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -220,11 +243,14 @@ __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only imag |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a > b ? (FLT4)1.f : (FLT4).0f); |
|
|
|
FLT4 result = a > b ? (FLT4)1.f : (FLT4).0f; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int2 output_shape) { |
|
|
|
__write_only image2d_t output, const int2 output_shape, float act_min, |
|
|
|
float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -233,58 +259,117 @@ __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a >= b ? (FLT4)1.f : (FLT4).0f); |
|
|
|
FLT4 result = a >= b ? (FLT4)1.f : (FLT4).0f; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int4 a_shape, const int4 b_shape, |
|
|
|
const int4 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); // C4 |
|
|
|
int Y = get_global_id(1); // W |
|
|
|
int Z = get_global_id(2); // N * H |
|
|
|
int N = Z / output_shape.y; |
|
|
|
int H = Z % output_shape.y; |
|
|
|
if (X >= output_shape.w || Y >= output_shape.z || Z >= output_shape.x * output_shape.y) { |
|
|
|
return; |
|
|
|
} |
|
|
|
int a_c = X < a_shape.w ? X : a_shape.w - 1; |
|
|
|
int a_w = Y < a_shape.z ? Y : a_shape.z - 1; |
|
|
|
int a_h = H < a_shape.y ? H : a_shape.y - 1; |
|
|
|
int a_n = N < a_shape.x ? N : a_shape.x - 1; |
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(a_w * a_shape.w + a_c, a_n * a_shape.y + a_h)); |
|
|
|
int b_c = X < b_shape.w ? X : b_shape.w - 1; |
|
|
|
int b_w = Y < b_shape.z ? Y : b_shape.z - 1; |
|
|
|
int b_h = H < b_shape.y ? H : b_shape.y - 1; |
|
|
|
int b_n = N < b_shape.x ? N : b_shape.x - 1; |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_n * b_shape.y + b_h)); |
|
|
|
FLT4 result = a + b; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, N * output_shape.y + H), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int4 a_shape, const int4 b_shape, |
|
|
|
const int4 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); // C4 |
|
|
|
int Y = get_global_id(1); // W |
|
|
|
int Z = get_global_id(2); // N * H |
|
|
|
int N = Z / output_shape.y; |
|
|
|
int H = Z % output_shape.y; |
|
|
|
if (X >= output_shape.w || Y >= output_shape.z || Z >= output_shape.x * output_shape.y) { |
|
|
|
return; |
|
|
|
} |
|
|
|
int a_c = X < a_shape.w ? X : a_shape.w - 1; |
|
|
|
int a_w = Y < a_shape.z ? Y : a_shape.z - 1; |
|
|
|
int a_h = H < a_shape.y ? H : a_shape.y - 1; |
|
|
|
int a_n = N < a_shape.x ? N : a_shape.x - 1; |
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(a_w * a_shape.w + a_c, a_n * a_shape.y + a_h)); |
|
|
|
int b_c = X < b_shape.w ? X : b_shape.w - 1; |
|
|
|
int b_w = Y < b_shape.z ? Y : b_shape.z - 1; |
|
|
|
int b_h = H < b_shape.y ? H : b_shape.y - 1; |
|
|
|
int b_n = N < b_shape.x ? N : b_shape.x - 1; |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_n * b_shape.y + b_h)); |
|
|
|
FLT4 result = a - b; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, N * output_shape.y + H), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int4 a_shape, const int4 b_shape, |
|
|
|
const int4 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); // C4 |
|
|
|
int Y = get_global_id(1); // W |
|
|
|
int Z = get_global_id(2); // N * H |
|
|
|
int N = Z / output_shape.y; |
|
|
|
int H = Z % output_shape.y; |
|
|
|
if (X >= output_shape.w || Y >= output_shape.z || Z >= output_shape.x * output_shape.y) { |
|
|
|
return; |
|
|
|
} |
|
|
|
int a_c = X < a_shape.w ? X : a_shape.w - 1; |
|
|
|
int a_w = Y < a_shape.z ? Y : a_shape.z - 1; |
|
|
|
int a_h = H < a_shape.y ? H : a_shape.y - 1; |
|
|
|
int a_n = N < a_shape.x ? N : a_shape.x - 1; |
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(a_w * a_shape.w + a_c, a_n * a_shape.y + a_h)); |
|
|
|
int b_c = X < b_shape.w ? X : b_shape.w - 1; |
|
|
|
int b_w = Y < b_shape.z ? Y : b_shape.z - 1; |
|
|
|
int b_h = H < b_shape.y ? H : b_shape.y - 1; |
|
|
|
int b_n = N < b_shape.x ? N : b_shape.x - 1; |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_n * b_shape.y + b_h)); |
|
|
|
FLT4 result = a * b; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, N * output_shape.y + H), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, |
|
|
|
__write_only image2d_t output, const int4 a_shape, const int4 b_shape, |
|
|
|
const int4 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); // C4 |
|
|
|
int Y = get_global_id(1); // W |
|
|
|
int Z = get_global_id(2); // N * H |
|
|
|
int N = Z / output_shape.y; |
|
|
|
int H = Z % output_shape.y; |
|
|
|
if (X >= output_shape.w || Y >= output_shape.z || Z >= output_shape.x * output_shape.y) { |
|
|
|
return; |
|
|
|
} |
|
|
|
int a_c = X < a_shape.w ? X : a_shape.w - 1; |
|
|
|
int a_w = Y < a_shape.z ? Y : a_shape.z - 1; |
|
|
|
int a_h = H < a_shape.y ? H : a_shape.y - 1; |
|
|
|
int a_n = N < a_shape.x ? N : a_shape.x - 1; |
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(a_w * a_shape.w + a_c, a_n * a_shape.y + a_h)); |
|
|
|
int b_c = X < b_shape.w ? X : b_shape.w - 1; |
|
|
|
int b_w = Y < b_shape.z ? Y : b_shape.z - 1; |
|
|
|
int b_h = H < b_shape.y ? H : b_shape.y - 1; |
|
|
|
int b_n = N < b_shape.x ? N : b_shape.x - 1; |
|
|
|
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(b_w * b_shape.w + b_c, b_n * b_shape.y + b_h)); |
|
|
|
FLT4 result = a / b; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, N * output_shape.y + H), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastAdd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
return; |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a + (FLT)b); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastSub_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
return; |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a - (FLT)b); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastMul_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
return; |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a * (FLT)b); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
return; |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, (FLT)b)); |
|
|
|
} |
|
|
|
__kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -292,11 +377,13 @@ __kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_o |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) & (UINT4)((FLT)b))); |
|
|
|
FLT4 result = AS_FLT4(AS_UINT4(a) & (UINT4)((FLT)b)); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -304,11 +391,13 @@ __kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_on |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) | (UINT4)((FLT)b))); |
|
|
|
FLT4 result = AS_FLT4(AS_UINT4(a) | (UINT4)((FLT)b)); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -316,11 +405,13 @@ __kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_o |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), max(a, (FLT4)b)); |
|
|
|
FLT4 result = max(a, (FLT4)b); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -328,11 +419,13 @@ __kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_o |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), min(a, (FLT4)b)); |
|
|
|
FLT4 result = min(a, (FLT4)b); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -340,11 +433,13 @@ __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __wr |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, (FLT4)b))); |
|
|
|
FLT4 result = floor(divide_no_check(a, (FLT4)b)); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -352,11 +447,13 @@ __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __wr |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, (FLT4)b)) * (FLT)b); |
|
|
|
FLT4 result = floor(divide_no_check(a, (FLT4)b)) * (FLT)b; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -364,11 +461,13 @@ __kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, floa |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), pown((a - (FLT4)b), (int4)2)); |
|
|
|
FLT4 result = pown((a - (FLT4)b), (int4)2); |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -376,11 +475,13 @@ __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a == (FLT4)b ? (FLT4)1.f : (FLT4).0f); |
|
|
|
FLT4 result = a == (FLT4)b ? (FLT4)1.f : (FLT4).0f; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -388,11 +489,13 @@ __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __wr |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a != (FLT4)b ? (FLT4)1.f : (FLT4).0f); |
|
|
|
FLT4 result = a != (FLT4)b ? (FLT4)1.f : (FLT4).0f; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -400,11 +503,13 @@ __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_ |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a < (FLT4)b ? (FLT4)1.f : (FLT4).0f); |
|
|
|
FLT4 result = a < (FLT4)b ? (FLT4)1.f : (FLT4).0f; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -412,11 +517,13 @@ __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __w |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a <= (FLT4)b ? (FLT4)1.f : (FLT4).0f); |
|
|
|
FLT4 result = a <= (FLT4)b ? (FLT4)1.f : (FLT4).0f; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -424,11 +531,13 @@ __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __wri |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a > (FLT4)b ? (FLT4)1.f : (FLT4).0f); |
|
|
|
FLT4 result = a > (FLT4)b ? (FLT4)1.f : (FLT4).0f; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, |
|
|
|
const int2 output_shape) { |
|
|
|
const int2 output_shape, float act_min, float act_max) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
if (X >= output_shape.x || Y >= output_shape.y) { |
|
|
|
@@ -436,7 +545,9 @@ __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), a >= (FLT4)b ? (FLT4)1.f : (FLT4).0f); |
|
|
|
FLT4 result = a >= (FLT4)b ? (FLT4)1.f : (FLT4).0f; |
|
|
|
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); |
|
|
|
WRITE_IMAGE(output, (int2)(X, Y), result); |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output, |
|
|
|
|