|
|
|
@@ -1,34 +1,88 @@ |
|
|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable |
|
|
|
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; |
|
|
|
__kernel void to_format_NHWC_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
__kernel void to_format_NHWC_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
FLT4 data = (FLT4)(0.f); |
|
|
|
int offset = (X * shape.z + Y) * shape.w + Z * 4; |
|
|
|
__global FLT *src_addr = (__global FLT *)src_data; |
|
|
|
__global float *src_addr = (__global float *)src_data; |
|
|
|
src_addr += offset; |
|
|
|
if ((Z + 1) * 4 <= shape.w) { |
|
|
|
data = TO_FLT4(((__global float4 *)src_addr)[0]); |
|
|
|
} else { |
|
|
|
if ((shape.w - Z * 4) >= 1) { |
|
|
|
data.x = (FLT)src_addr[0]; |
|
|
|
} |
|
|
|
if ((shape.w - Z * 4) >= 2) { |
|
|
|
data.y = (FLT)src_addr[1]; |
|
|
|
} |
|
|
|
if ((shape.w - Z * 4) >= 3) { |
|
|
|
data.z = (FLT)src_addr[2]; |
|
|
|
} |
|
|
|
} |
|
|
|
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); |
|
|
|
} |
|
|
|
__kernel void to_format_NHWC_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
FLT4 data = (FLT4)(0.f); |
|
|
|
int offset = (X * shape.z + Y) * shape.w + Z * 4; |
|
|
|
__global half *src_addr = (__global half *)src_data; |
|
|
|
src_addr += offset; |
|
|
|
if ((Z + 1) * 4 <= shape.w) { |
|
|
|
data = ((__global FLT4 *)src_addr)[0]; |
|
|
|
data = TO_FLT4(((__global half4 *)src_addr)[0]); |
|
|
|
} else { |
|
|
|
if ((shape.w - Z * 4) >= 1) { |
|
|
|
data.x = src_addr[0]; |
|
|
|
data.x = (FLT)src_addr[0]; |
|
|
|
} |
|
|
|
if ((shape.w - Z * 4) >= 2) { |
|
|
|
data.y = src_addr[1]; |
|
|
|
data.y = (FLT)src_addr[1]; |
|
|
|
} |
|
|
|
if ((shape.w - Z * 4) >= 3) { |
|
|
|
data.z = src_addr[2]; |
|
|
|
data.z = (FLT)src_addr[2]; |
|
|
|
} |
|
|
|
} |
|
|
|
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); |
|
|
|
} |
|
|
|
__kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
__kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
int offset = (X * shape.z + Y) * shape.w + Z * 4; |
|
|
|
__global float *src_addr = (__global float *)src_data; |
|
|
|
src_addr += offset; |
|
|
|
FLT4 data = (FLT4)(0.f); |
|
|
|
if ((Z + 1) * 4 <= shape.w) { |
|
|
|
data = TO_FLT4(((__global float4 *)src_addr)[0]); |
|
|
|
} else { |
|
|
|
if ((shape.w - Z * 4) >= 1) { |
|
|
|
data.x = (FLT)src_addr[0]; |
|
|
|
} |
|
|
|
if ((shape.w - Z * 4) >= 2) { |
|
|
|
data.y = (FLT)src_addr[1]; |
|
|
|
} |
|
|
|
if ((shape.w - Z * 4) >= 3) { |
|
|
|
data.z = (FLT)src_addr[2]; |
|
|
|
} |
|
|
|
} |
|
|
|
WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), data); |
|
|
|
} |
|
|
|
__kernel void to_format_NHWC_to_NC4HW4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
@@ -36,36 +90,57 @@ __kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only |
|
|
|
return; |
|
|
|
} |
|
|
|
int offset = (X * shape.z + Y) * shape.w + Z * 4; |
|
|
|
__global FLT *src_addr = (__global FLT *)src_data; |
|
|
|
__global half *src_addr = (__global half *)src_data; |
|
|
|
src_addr += offset; |
|
|
|
FLT4 data = (FLT4)(0.f); |
|
|
|
if ((Z + 1) * 4 <= shape.w) { |
|
|
|
data = ((__global FLT4 *)src_addr)[0]; |
|
|
|
data = TO_FLT4(((__global half4 *)src_addr)[0]); |
|
|
|
} else { |
|
|
|
if ((shape.w - Z * 4) >= 1) { |
|
|
|
data.x = src_addr[0]; |
|
|
|
data.x = (FLT)src_addr[0]; |
|
|
|
} |
|
|
|
if ((shape.w - Z * 4) >= 2) { |
|
|
|
data.y = src_addr[1]; |
|
|
|
data.y = (FLT)src_addr[1]; |
|
|
|
} |
|
|
|
if ((shape.w - Z * 4) >= 3) { |
|
|
|
data.z = src_addr[2]; |
|
|
|
data.z = (FLT)src_addr[2]; |
|
|
|
} |
|
|
|
} |
|
|
|
WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), data); |
|
|
|
} |
|
|
|
__kernel void to_format_NHWC4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
__kernel void to_format_NHWC4_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), TO_FLT4(src_data[(X * size.y + Y) * size.z + Z])); |
|
|
|
} |
|
|
|
__kernel void to_format_NHWC4_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(X * size.y + Y) * size.z + Z]); |
|
|
|
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), TO_FLT4(src_data[(X * size.y + Y) * size.z + Z])); |
|
|
|
} |
|
|
|
__kernel void to_format_NC4HW4_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, |
|
|
|
int4 size, int4 shape) { |
|
|
|
// size(h, w, c4, 1), shape(n, c, h, w) |
|
|
|
int X = get_global_id(0); // h |
|
|
|
int Y = get_global_id(1); // w |
|
|
|
int Z = get_global_id(2); // c4 |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), TO_FLT4(src_data[(Z * size.x + X) * size.y + Y])); |
|
|
|
} |
|
|
|
__kernel void to_format_NC4HW4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
__kernel void to_format_NC4HW4_to_NC4HW4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
// size(h, w, c4, 1), shape(n, c, h, w) |
|
|
|
int X = get_global_id(0); // h |
|
|
|
int Y = get_global_id(1); // w |
|
|
|
@@ -73,32 +148,94 @@ __kernel void to_format_NC4HW4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_on |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), src_data[(Z * size.x + X) * size.y + Y]); |
|
|
|
WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), TO_FLT4(src_data[(Z * size.x + X) * size.y + Y])); |
|
|
|
} |
|
|
|
__kernel void to_format_NCHW_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
dst_data[(Z * size.y + Y) * size.x + X] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z))); |
|
|
|
} |
|
|
|
__kernel void to_format_NCHW_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
dst_data[(Z * size.y + Y) * size.x + X] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z))); |
|
|
|
} |
|
|
|
__kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); |
|
|
|
int offset = (X * shape.z + Y) * shape.w + Z * 4; |
|
|
|
__global float *dst_addr = (__global float *)dst_data; |
|
|
|
dst_addr += offset; |
|
|
|
if ((Z + 1) * 4 <= shape.w) { |
|
|
|
((__global float4 *)dst_addr)[0] = data; |
|
|
|
} else { |
|
|
|
if (shape.w - Z * 4 >= 1) { |
|
|
|
dst_addr[0] = data.x; |
|
|
|
} |
|
|
|
if (shape.w - Z * 4 >= 2) { |
|
|
|
dst_addr[1] = data.y; |
|
|
|
} |
|
|
|
if (shape.w - Z * 4 >= 3) { |
|
|
|
dst_addr[2] = data.z; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
__kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
__kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
dst_data[(Z * size.y + Y) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z)); |
|
|
|
half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); |
|
|
|
int offset = (X * shape.z + Y) * shape.w + Z * 4; |
|
|
|
__global half *dst_addr = (__global half *)dst_data; |
|
|
|
dst_addr += offset; |
|
|
|
if ((Z + 1) * 4 <= shape.w) { |
|
|
|
((__global half4 *)dst_addr)[0] = data; |
|
|
|
} else { |
|
|
|
if (shape.w - Z * 4 >= 1) { |
|
|
|
dst_addr[0] = data.x; |
|
|
|
} |
|
|
|
if (shape.w - Z * 4 >= 2) { |
|
|
|
dst_addr[1] = data.y; |
|
|
|
} |
|
|
|
if (shape.w - Z * 4 >= 3) { |
|
|
|
dst_addr[2] = data.z; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
__kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
__kernel void to_format_NC4HW4_to_NHWC_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
FLT4 data = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)); |
|
|
|
float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X))); |
|
|
|
int offset = (X * shape.z + Y) * shape.w + Z * 4; |
|
|
|
__global FLT *dst_addr = (__global FLT *)dst_data; |
|
|
|
__global float *dst_addr = (__global float *)dst_data; |
|
|
|
dst_addr += offset; |
|
|
|
if ((Z + 1) * 4 <= shape.w) { |
|
|
|
((__global FLT4 *)dst_addr)[0] = data; |
|
|
|
((__global float4 *)dst_addr)[0] = data; |
|
|
|
} else { |
|
|
|
if (shape.w - Z * 4 >= 1) { |
|
|
|
dst_addr[0] = data.x; |
|
|
|
@@ -111,20 +248,20 @@ __kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __glob |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
__kernel void to_format_NC4HW4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
__kernel void to_format_NC4HW4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
FLT4 data = READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X)); |
|
|
|
half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X))); |
|
|
|
int offset = (X * shape.z + Y) * shape.w + Z * 4; |
|
|
|
__global FLT *dst_addr = (__global FLT *)dst_data; |
|
|
|
__global half *dst_addr = (__global half *)dst_data; |
|
|
|
dst_addr += offset; |
|
|
|
if ((Z + 1) * 4 <= shape.w) { |
|
|
|
((__global FLT4 *)dst_addr)[0] = data; |
|
|
|
((__global half4 *)dst_addr)[0] = data; |
|
|
|
} else { |
|
|
|
if (shape.w - Z * 4 >= 1) { |
|
|
|
dst_addr[0] = data.x; |
|
|
|
@@ -137,8 +274,19 @@ __kernel void to_format_NC4HW4_to_NHWC_BUF(__read_only image2d_t src_data, __glo |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
__kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
__kernel void to_format_NC4HW4_to_NC4HW4_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
// size(h, w, c, 1), shape(n, c, h, w) |
|
|
|
int X = get_global_id(0); // h |
|
|
|
int Y = get_global_id(1); // w |
|
|
|
int Z = get_global_id(2); // c |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
dst_data[(Z * size.x + X) * size.y + Y] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X))); |
|
|
|
} |
|
|
|
__kernel void to_format_NC4HW4_to_NC4HW4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
// size(h, w, c, 1), shape(n, c, h, w) |
|
|
|
int X = get_global_id(0); // h |
|
|
|
int Y = get_global_id(1); // w |
|
|
|
@@ -146,15 +294,15 @@ __kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __g |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
dst_data[(Z * size.x + X) * size.y + Y] = READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X)); |
|
|
|
dst_data[(Z * size.x + X) * size.y + Y] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X))); |
|
|
|
} |
|
|
|
__kernel void to_format_NHWC4_to_NHWC4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
__kernel void to_format_NHWC4_to_NHWC4_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, |
|
|
|
int4 shape) { |
|
|
|
int X = get_global_id(0); |
|
|
|
int Y = get_global_id(1); |
|
|
|
int Z = get_global_id(2); |
|
|
|
if (X >= size.x || Y >= size.y || Z >= size.z) { |
|
|
|
return; |
|
|
|
} |
|
|
|
dst_data[(X * size.y + Y) * size.z + Z] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)); |
|
|
|
dst_data[(X * size.y + Y) * size.z + Z] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); |
|
|
|
} |