|
|
|
@@ -1,8 +1,9 @@ |
|
|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable |
|
|
|
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; |
|
|
|
|
|
|
|
__kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output, |
|
|
|
int4 input_shape0, int4 input_shape1, int4 output_shape, const int axis) { |
|
|
|
__kernel void Concat2input_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1, |
|
|
|
__write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 output_shape, |
|
|
|
const int axis) { |
|
|
|
int X = get_global_id(0); // N*H |
|
|
|
int Y = get_global_id(1); // W |
|
|
|
int Z = get_global_id(2); // c/4 |
|
|
|
@@ -44,9 +45,9 @@ __kernel void Concat(__read_only image2d_t input0, __read_only image2d_t input1, |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void Concat3input(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2, |
|
|
|
__write_only image2d_t output, int4 input_shape0, int4 input_shape1, int4 input_shape2, |
|
|
|
int4 output_shape, const int axis) { |
|
|
|
__kernel void Concat3input_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1, |
|
|
|
__read_only image2d_t input2, __write_only image2d_t output, int4 input_shape0, |
|
|
|
int4 input_shape1, int4 input_shape2, int4 output_shape, const int axis) { |
|
|
|
int X = get_global_id(0); // N*H |
|
|
|
int Y = get_global_id(1); // W |
|
|
|
int Z = get_global_id(2); // c/4 |
|
|
|
@@ -105,3 +106,144 @@ __kernel void Concat3input(__read_only image2d_t input0, __read_only image2d_t i |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void Concat2input_NC4HW4(__read_only image2d_t input0, __read_only image2d_t input1, |
|
|
|
__write_only image2d_t output, int4 input_shape0, int4 input_shape1, |
|
|
|
int4 output_shape, const int axis) { |
|
|
|
int X = get_global_id(0); // H |
|
|
|
int Y = get_global_id(1); // W |
|
|
|
int Z = get_global_id(2); // c/4 |
|
|
|
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { |
|
|
|
return; |
|
|
|
} |
|
|
|
if (input_shape0.y == 0 || input_shape1.y == 0 || output_shape.y == 0) { |
|
|
|
return; |
|
|
|
} |
|
|
|
int in_postion_x; |
|
|
|
int out_pos_x = (X / output_shape.y) * output_shape.w * output_shape.y + Z * output_shape.y + X % output_shape.y; |
|
|
|
if (axis == 0) { |
|
|
|
if (X < (input_shape0.x * input_shape0.y)) { |
|
|
|
in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; |
|
|
|
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} else { |
|
|
|
in_postion_x = ((X - input_shape0.x * input_shape0.y) / input_shape1.y) * input_shape1.w * input_shape1.y + |
|
|
|
Z * input_shape1.y + ((X - input_shape0.x * input_shape0.y) % input_shape1.y); |
|
|
|
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} |
|
|
|
} else if (axis == 1) { |
|
|
|
if (X < input_shape0.y) { |
|
|
|
in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; |
|
|
|
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} else { |
|
|
|
in_postion_x = ((X - input_shape0.y) / input_shape1.y) * input_shape1.w * input_shape1.y + Z * input_shape1.y + |
|
|
|
((X - input_shape0.y) % input_shape1.y); |
|
|
|
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} |
|
|
|
} else if (axis == 2) { |
|
|
|
if (Y < input_shape0.z) { |
|
|
|
in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; |
|
|
|
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} else { |
|
|
|
in_postion_x = (X / input_shape1.y) * input_shape1.w * input_shape1.y + Z * input_shape1.y + (X % input_shape1.y); |
|
|
|
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} |
|
|
|
} else { |
|
|
|
if (Z < input_shape0.w) { |
|
|
|
in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; |
|
|
|
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} else { |
|
|
|
in_postion_x = (X / input_shape1.y) * input_shape1.w * input_shape1.y + (Z - input_shape0.w) * input_shape1.y + |
|
|
|
(X % input_shape1.y); |
|
|
|
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void Concat3input_NC4HW4(__read_only image2d_t input0, __read_only image2d_t input1, |
|
|
|
__read_only image2d_t input2, __write_only image2d_t output, int4 input_shape0, |
|
|
|
int4 input_shape1, int4 input_shape2, int4 output_shape, const int axis) { |
|
|
|
int X = get_global_id(0); // N*H |
|
|
|
int Y = get_global_id(1); // W |
|
|
|
int Z = get_global_id(2); // c/4 |
|
|
|
if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { |
|
|
|
return; |
|
|
|
} |
|
|
|
if (input_shape0.y == 0 || input_shape1.y == 0 || input_shape2.y == 0 || output_shape.y == 0) { |
|
|
|
return; |
|
|
|
} |
|
|
|
int in_postion_x; |
|
|
|
int out_pos_x = (X / output_shape.y) * output_shape.w * output_shape.y + Z * output_shape.y + X % output_shape.y; |
|
|
|
if (axis == 0) { |
|
|
|
if (X < (input_shape0.x * input_shape0.y)) { |
|
|
|
in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; |
|
|
|
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} else if (X < (input_shape0.x * input_shape0.y + input_shape1.x * input_shape1.y)) { |
|
|
|
in_postion_x = ((X - input_shape0.x * input_shape0.y) / input_shape1.y) * input_shape1.w * input_shape1.y + |
|
|
|
Z * input_shape1.y + ((X - input_shape0.x * input_shape0.y) % input_shape1.y); |
|
|
|
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} else { |
|
|
|
in_postion_x = ((X - input_shape0.x * input_shape0.y - input_shape1.x * input_shape1.y) / input_shape2.y) * |
|
|
|
input_shape2.w * input_shape2.y + |
|
|
|
Z * input_shape2.y + |
|
|
|
(X - input_shape0.x * input_shape0.y - input_shape1.x * input_shape1.y) % input_shape2.y; |
|
|
|
FLT4 result = READ_IMAGE(input2, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} |
|
|
|
} else if (axis == 1) { |
|
|
|
if (X < input_shape0.y) { |
|
|
|
in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; |
|
|
|
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} else if (X < input_shape0.y + input_shape1.y) { |
|
|
|
in_postion_x = ((X - input_shape0.y) / input_shape1.y) * input_shape1.w * input_shape1.y + Z * input_shape1.y + |
|
|
|
((X - input_shape0.y) % input_shape1.y); |
|
|
|
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} else { |
|
|
|
in_postion_x = ((X - input_shape0.y - input_shape1.y) / input_shape2.y) * input_shape2.w * input_shape2.y + |
|
|
|
Z * input_shape2.y + ((X - input_shape0.y - input_shape1.y) % input_shape2.y); |
|
|
|
FLT4 result = READ_IMAGE(input2, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} |
|
|
|
} else if (axis == 2) { |
|
|
|
if (Y < input_shape0.z) { |
|
|
|
in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; |
|
|
|
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} else if (Y < input_shape0.z + input_shape1.z) { |
|
|
|
in_postion_x = (X / input_shape1.y) * input_shape1.w * input_shape1.y + Z * input_shape1.y + (X % input_shape1.y); |
|
|
|
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y - input_shape0.z), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} else { |
|
|
|
in_postion_x = (X / input_shape2.y) * input_shape2.w * input_shape2.y + Z * input_shape2.y + (X % input_shape2.y); |
|
|
|
FLT4 result = READ_IMAGE(input2, smp_none, (int2)((Y - input_shape0.z - input_shape1.z), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} |
|
|
|
} else { |
|
|
|
if (Z < input_shape0.w) { |
|
|
|
in_postion_x = (X / input_shape0.y) * input_shape0.w * input_shape0.y + Z * input_shape0.y + X % input_shape0.y; |
|
|
|
FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} else if (Z < input_shape0.w + input_shape1.w) { |
|
|
|
in_postion_x = (X / input_shape1.y) * input_shape1.w * input_shape1.y + (Z - input_shape0.w) * input_shape1.y + |
|
|
|
(X % input_shape1.y); |
|
|
|
FLT4 result = READ_IMAGE(input1, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} else { |
|
|
|
in_postion_x = (X / input_shape2.y) * input_shape2.w * input_shape2.y + |
|
|
|
(Z - input_shape0.w - input_shape1.w) * input_shape2.y + (X % input_shape2.y); |
|
|
|
FLT4 result = READ_IMAGE(input2, smp_none, (int2)((Y), in_postion_x)); |
|
|
|
WRITE_IMAGE(output, (int2)((Y), out_pos_x), result); |
|
|
|
} |
|
|
|
} |
|
|
|
} |