|
|
|
@@ -3,268 +3,420 @@ |
|
|
|
// Sampler used by the image reads in this file: unnormalized (integer pixel)
// coordinates, CLK_ADDRESS_CLAMP addressing and nearest-neighbour filtering.
// NOTE(review): the convolution kernels deliberately read at out-of-range
// coordinates (e.g. a row index of -1 for padded rows) and, judging by the
// name, rely on such reads returning zero -- confirm against the image format
// the host allocates.
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; |
|
|
|
|
|
|
|
// Factor applied to the per-output-slice weight offset
// (co_slice * KH * KW * CI_SLICES * CI_TILE); the convolution loops consume
// four FLT4 weight vectors (indices 0..3) per input slice and output slice.
#define CI_TILE 4 |
|
|
|
// NOTE(review): only referenced from a comment in the visible code
// ("UP_DIV(CO, CO_TILE)"); presumably the number of output channels packed
// into one slice -- confirm against the host code.
#define CO_TILE 4 |
|
|
|
// Threshold compared against OW * CO_SLICES when writing results: at or below
// it the output is addressed as (ow * CO_SLICES + co_slice, n_oh), otherwise as
// (n_oh * CO_SLICES + co_slice, ow).
#define MAX_IMAGE2D_SIZE 65535 |
|
|
|
// Integer division of x by y rounded up (for positive operands).
// Arguments are evaluated once each but must be free of side effects only in
// the usual macro sense; both are fully parenthesized.
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) |
|
|
|
|
|
|
|
// Values of the `act_type` kernel argument selecting the activation applied
// after the bias add. In the code visible here only ActType_Relu (max with 0)
// and ActType_Relu6 (clamp to [0, 6]) are acted upon; any other value leaves
// the result unchanged.
#define ActType_No 0 |
|
|
|
#define ActType_Relu 1 |
|
|
|
// NOTE(review): "Sigmod" looks like a misspelling of "Sigmoid"; the identifier
// is left as-is because it may be referenced outside this file. No visible
// kernel handles this value.
#define ActType_Sigmod 2 |
|
|
|
#define ActType_Relu6 3 |
|
|
|
|
|
|
|
__kernel void Convolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, |
|
|
|
__global FLT4 *bias, const int4 input_shape, const int4 output_shape, |
|
|
|
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { |
|
|
|
const int N = input_shape.x; |
|
|
|
const int IH = input_shape.y; |
|
|
|
const int IW = input_shape.z; |
|
|
|
const int CI_SLICES = input_shape.w; |
|
|
|
|
|
|
|
const int OH = output_shape.y; |
|
|
|
const int OW = output_shape.z; |
|
|
|
const int CO_SLICES = output_shape.w; |
|
|
|
|
|
|
|
const int KH = kernel_stride.x; |
|
|
|
const int KW = kernel_stride.y; |
|
|
|
const int strideH = kernel_stride.z; |
|
|
|
const int strideW = kernel_stride.w; |
|
|
|
|
|
|
|
const int padTop = pad.x; |
|
|
|
const int padLeft = pad.z; |
|
|
|
|
|
|
|
const int dilationH = dilation.x; |
|
|
|
const int dilationW = dilation.y; |
|
|
|
|
|
|
|
int n_oh = get_global_id(0); // [0, N*OH) |
|
|
|
int ow = get_global_id(1); // [0, OW) |
|
|
|
int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) ) |
|
|
|
int n; |
|
|
|
int oh; |
|
|
|
if (N == 1) { |
|
|
|
n = 0; |
|
|
|
oh = n_oh; |
|
|
|
} else { |
|
|
|
n = n_oh / OH; |
|
|
|
oh = n_oh % OH; |
|
|
|
} |
|
|
|
if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { |
|
|
|
return; |
|
|
|
// DEFINE_ARGS: shared prologue for the blocked Convolution_H*W*C* kernels.
//
// Must be expanded at the top of a kernel body where the parameters
// `input_shape`, `output_shape`, `kernel_stride`, `pad` and `dilation` and the
// constants `BlockH`, `BlockW` and `BlockC` are already in scope.
//
// It declares:
//   N, IH, IW, CI_SLICES      <- input_shape.x/.y/.z/.w
//   OH, OW, CO_SLICES         <- output_shape.y/.z/.w
//   KH, KW, strideH, strideW  <- kernel_stride.x/.y/.z/.w
//   padTop, padBottom, padLeft, padRight <- pad.x/.y/.z/.w
//   dilationH, dilationW      <- dilation.x/.y
//   n_oh      = get_global_id(0)
//   ow        = get_global_id(1) * BlockW   (first output column of this work-item)
//   co_slice  = get_global_id(2) * BlockC   (first output slice of this work-item)
//   OH_SLICES = UP_DIV(OH, BlockH)          (row blocks per batch element)
//   n         = n_oh / OH_SLICES
//   oh        = (n_oh % OH_SLICES) * BlockH (first output row of this work-item)
//
// The expansion ends with a guard that returns from the enclosing kernel when
// the work-item's first output element lies outside the output tensor, so the
// macro is only usable directly inside a function returning void.
#define DEFINE_ARGS \ |
|
|
|
const int N = input_shape.x; \ |
|
|
|
const int IH = input_shape.y, IW = input_shape.z, CI_SLICES = input_shape.w; \ |
|
|
|
const int OH = output_shape.y, OW = output_shape.z, CO_SLICES = output_shape.w; \ |
|
|
|
const int KH = kernel_stride.x, KW = kernel_stride.y; \ |
|
|
|
const int strideH = kernel_stride.z, strideW = kernel_stride.w; \ |
|
|
|
const int padTop = pad.x, padBottom = pad.y, padLeft = pad.z, padRight = pad.w; \ |
|
|
|
const int dilationH = dilation.x, dilationW = dilation.y; \ |
|
|
|
\ |
|
|
|
const int n_oh = get_global_id(0); \ |
|
|
|
const int ow = get_global_id(1) * BlockW; \ |
|
|
|
const int co_slice = get_global_id(2) * BlockC; \ |
|
|
|
const int OH_SLICES = UP_DIV(OH, BlockH); \ |
|
|
|
const int n = n_oh / OH_SLICES; \ |
|
|
|
const int oh = (n_oh % OH_SLICES) * BlockH; \ |
|
|
|
if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { \ |
|
|
|
return; \ |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 out_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
__global FLT4 *w_ic1_oc4 = weight + co_slice * KH * KW * CI_SLICES * CI_TILE; |
|
|
|
__kernel void Convolution_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, |
|
|
|
__global FLT4 *bias, const int4 input_shape, const int4 output_shape, |
|
|
|
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { |
|
|
|
const int BlockH = 1; |
|
|
|
const int BlockW = 1; |
|
|
|
const int BlockC = 1; |
|
|
|
DEFINE_ARGS; |
|
|
|
|
|
|
|
const int oh0 = oh + 0; |
|
|
|
const int n_oh0 = n * OH + oh0; |
|
|
|
const int ow0 = ow + 0; |
|
|
|
const int co_slice0 = co_slice + 0; |
|
|
|
|
|
|
|
FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
|
|
|
|
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; |
|
|
|
|
|
|
|
for (int kh = 0; kh < KH; ++kh) { |
|
|
|
int ih = kh * dilationH + oh * strideH - padTop; |
|
|
|
const int ih0 = kh * dilationH + oh0 * strideH - padTop; |
|
|
|
const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; |
|
|
|
|
|
|
|
for (int kw = 0; kw < KW; ++kw) { |
|
|
|
int iw = kw * dilationW + ow * strideW - padLeft; |
|
|
|
if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) { |
|
|
|
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { |
|
|
|
FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, n * IH + ih)); |
|
|
|
out_c4 += w_ic1_oc4[0] * in_c4.x; |
|
|
|
out_c4 += w_ic1_oc4[1] * in_c4.y; |
|
|
|
out_c4 += w_ic1_oc4[2] * in_c4.z; |
|
|
|
out_c4 += w_ic1_oc4[3] * in_c4.w; |
|
|
|
w_ic1_oc4 += 4; |
|
|
|
} |
|
|
|
} else { |
|
|
|
w_ic1_oc4 += 4 * CI_SLICES; |
|
|
|
const int iw0 = kw * dilationW + ow0 * strideW - padLeft; |
|
|
|
int x_idx0 = iw0 * CI_SLICES; |
|
|
|
|
|
|
|
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { |
|
|
|
FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); |
|
|
|
x_idx0++; |
|
|
|
|
|
|
|
out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x; |
|
|
|
out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y; |
|
|
|
out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z; |
|
|
|
out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w; |
|
|
|
|
|
|
|
weight_ptr += 4; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if (bias != 0) { |
|
|
|
out_c4 = out_c4 + bias[co_slice]; |
|
|
|
if (bias) { |
|
|
|
out_h0_w0_c0 += bias[co_slice0]; |
|
|
|
} |
|
|
|
|
|
|
|
// activation |
|
|
|
if (act_type == ActType_Relu) { |
|
|
|
out_c4 = max(out_c4, (FLT4)(0.0f)); |
|
|
|
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); |
|
|
|
} else if (act_type == ActType_Relu6) { |
|
|
|
out_c4 = clamp(out_c4, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
} |
|
|
|
|
|
|
|
if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { |
|
|
|
WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out_c4); |
|
|
|
|
|
|
|
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); |
|
|
|
} else { |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out_c4); |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// 6x6 coefficient table stored row-major (element (r, c) is Bt[r * 6 + c]).
// Winograd4x4To36 applies it twice to each 6x6 input tile: first one row of
// the table is multiplied into the tile (Bt + row * 6), then the result is
// combined with every row of the table (Bt[y * 6 + x]).
// NOTE(review): presumably the B^T input-transform matrix of a Winograd
// F(4x4, 3x3) scheme -- the values are precomputed constants whose derivation
// is not shown here, so do not edit them by hand.
constant FLT Bt[36] = { |
|
|
|
1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f, |
|
|
|
0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f, |
|
|
|
0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f, |
|
|
|
0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f, |
|
|
|
0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f, |
|
|
|
0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f, |
|
|
|
}; |
|
|
|
|
|
|
|
__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output, |
|
|
|
const int4 input_shape, // N H W CI_SLICES |
|
|
|
const int4 output_shape) { // N 36 H/4*W/4 CI_SLICES |
|
|
|
#define PAD 1 |
|
|
|
int tile_xy = get_global_id(0); |
|
|
|
int row = get_global_id(1); |
|
|
|
int slice = get_global_id(2); |
|
|
|
|
|
|
|
int TILE_XY = output_shape.z; |
|
|
|
int SLICES = input_shape.w; |
|
|
|
if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES) { |
|
|
|
return; |
|
|
|
} |
|
|
|
__kernel void Convolution_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, |
|
|
|
__global FLT4 *bias, const int4 input_shape, const int4 output_shape, |
|
|
|
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { |
|
|
|
const int BlockH = 2; |
|
|
|
const int BlockW = 1; |
|
|
|
const int BlockC = 1; |
|
|
|
DEFINE_ARGS; |
|
|
|
|
|
|
|
const int oh0 = oh + 0; |
|
|
|
const int oh1 = oh + 1; |
|
|
|
const int n_oh0 = n * OH + oh0; |
|
|
|
const int n_oh1 = n * OH + oh1; |
|
|
|
const int ow0 = ow + 0; |
|
|
|
const int co_slice0 = co_slice + 0; |
|
|
|
|
|
|
|
int IW = input_shape.z; |
|
|
|
int TILE_X = UP_DIV(IW, 4); |
|
|
|
int tile_x = tile_xy % TILE_X; |
|
|
|
int tile_y = tile_xy / TILE_X; |
|
|
|
|
|
|
|
constant FLT *Bt_row = Bt + row * 6; |
|
|
|
FLT4 BtD_row[6] = {0}; |
|
|
|
|
|
|
|
int ih = tile_y * 4 - PAD; |
|
|
|
int iw = tile_x * 4 - PAD; |
|
|
|
for (int y = 0; y < 6; y++) { |
|
|
|
int x_idx = iw * SLICES + slice; |
|
|
|
for (int x = 0; x < 6; x++) { |
|
|
|
// No need to bounds-check iw: slice is in [0, SLICES), so iw < 0 gives x_idx < 0 and iw >= IW gives x_idx >= IW * SLICES, i.e. the read coordinate is out of range in both cases. |
|
|
|
// if (iw < 0 || iw >= IW) { continue; } |
|
|
|
BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, ih)); |
|
|
|
x_idx += SLICES; |
|
|
|
FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
|
|
|
|
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; |
|
|
|
|
|
|
|
for (int kh = 0; kh < KH; ++kh) { |
|
|
|
const int ih0 = kh * dilationH + oh0 * strideH - padTop; |
|
|
|
// No need to bounds-check oh1 here: the final write-out is guarded by (oh1 < OH). |
|
|
|
const int ih1 = kh * dilationH + oh1 * strideH - padTop; |
|
|
|
// check ih0 and ih1 |
|
|
|
const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; |
|
|
|
const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; |
|
|
|
|
|
|
|
for (int kw = 0; kw < KW; ++kw) { |
|
|
|
const int iw0 = kw * dilationW + ow0 * strideW - padLeft; |
|
|
|
int x_idx0 = iw0 * CI_SLICES; |
|
|
|
|
|
|
|
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { |
|
|
|
FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); |
|
|
|
FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1)); |
|
|
|
x_idx0++; |
|
|
|
|
|
|
|
out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x; |
|
|
|
out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x; |
|
|
|
out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y; |
|
|
|
out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y; |
|
|
|
out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z; |
|
|
|
out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z; |
|
|
|
out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w; |
|
|
|
out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w; |
|
|
|
|
|
|
|
weight_ptr += 4; |
|
|
|
} |
|
|
|
} |
|
|
|
ih++; |
|
|
|
} |
|
|
|
|
|
|
|
int y_idx = slice * 36 + row * 6; |
|
|
|
for (int y = 0; y < 6; y++) { |
|
|
|
FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
for (int x = 0; x < 6; x++) { |
|
|
|
acc += BtD_row[x] * Bt[y * 6 + x]; |
|
|
|
} |
|
|
|
WRITE_IMAGE(output, (int2)(tile_xy, y_idx + y), acc); // CH W H=36 |
|
|
|
if (bias) { |
|
|
|
out_h0_w0_c0 += bias[co_slice0]; |
|
|
|
out_h1_w0_c0 += bias[co_slice0]; |
|
|
|
} |
|
|
|
|
|
|
|
if (act_type == ActType_Relu) { |
|
|
|
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); |
|
|
|
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); |
|
|
|
} else if (act_type == ActType_Relu6) { |
|
|
|
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
} |
|
|
|
|
|
|
|
if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { |
|
|
|
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); |
|
|
|
if (oh1 < OH) { |
|
|
|
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); |
|
|
|
} // end if (oh1 < OH) |
|
|
|
} else { |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); |
|
|
|
if (oh1 < OH) { |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0); |
|
|
|
} // end (oh1 < OH) |
|
|
|
} |
|
|
|
#undef PAD |
|
|
|
} |
|
|
|
|
|
|
|
__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, |
|
|
|
const int4 input_shape, // N 36 H/4*W/4 CI_SLICES |
|
|
|
const int4 output_shape) { // N 36 H/4*W/4 CO_SLICES |
|
|
|
#define H 36 |
|
|
|
int w = get_global_id(0) * 2; |
|
|
|
int h = get_global_id(1); |
|
|
|
int co_slice = get_global_id(2) * 2; |
|
|
|
__kernel void Convolution_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, |
|
|
|
__global FLT4 *bias, const int4 input_shape, const int4 output_shape, |
|
|
|
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { |
|
|
|
const int BlockH = 2; |
|
|
|
const int BlockW = 1; |
|
|
|
const int BlockC = 2; |
|
|
|
DEFINE_ARGS; |
|
|
|
|
|
|
|
const int oh0 = oh + 0; |
|
|
|
const int oh1 = oh + 1; |
|
|
|
const int n_oh0 = n * OH + oh0; |
|
|
|
const int n_oh1 = n * OH + oh1; |
|
|
|
const int ow0 = ow + 0; |
|
|
|
const int co_slice0 = co_slice + 0; |
|
|
|
const int co_slice1 = co_slice + 1; |
|
|
|
|
|
|
|
FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
|
|
|
|
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; |
|
|
|
|
|
|
|
int CI_SLICES = input_shape.w; |
|
|
|
int W = input_shape.z; |
|
|
|
int CO_SLICES = output_shape.w; |
|
|
|
for (int kh = 0; kh < KH; ++kh) { |
|
|
|
const int ih0 = kh * dilationH + oh0 * strideH - padTop; |
|
|
|
// No need to bounds-check oh1 here: the final write-out is guarded by (oh1 < OH). |
|
|
|
const int ih1 = kh * dilationH + oh1 * strideH - padTop; |
|
|
|
// check ih0 and ih1 |
|
|
|
const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; |
|
|
|
const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; |
|
|
|
|
|
|
|
if (h >= H || w >= W || co_slice >= CO_SLICES) { |
|
|
|
return; |
|
|
|
for (int kw = 0; kw < KW; ++kw) { |
|
|
|
const int iw0 = kw * dilationW + ow0 * strideW - padLeft; |
|
|
|
int x_idx0 = iw0 * CI_SLICES; |
|
|
|
|
|
|
|
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { |
|
|
|
FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); |
|
|
|
FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1)); |
|
|
|
x_idx0++; |
|
|
|
|
|
|
|
out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x; |
|
|
|
out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x; |
|
|
|
out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y; |
|
|
|
out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y; |
|
|
|
out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z; |
|
|
|
out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z; |
|
|
|
out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w; |
|
|
|
out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w; |
|
|
|
|
|
|
|
out_h0_w0_c1 += weight_ptr[4] * in_h0_w0.x; |
|
|
|
out_h1_w0_c1 += weight_ptr[4] * in_h1_w0.x; |
|
|
|
out_h0_w0_c1 += weight_ptr[5] * in_h0_w0.y; |
|
|
|
out_h1_w0_c1 += weight_ptr[5] * in_h1_w0.y; |
|
|
|
out_h0_w0_c1 += weight_ptr[6] * in_h0_w0.z; |
|
|
|
out_h1_w0_c1 += weight_ptr[6] * in_h1_w0.z; |
|
|
|
out_h0_w0_c1 += weight_ptr[7] * in_h0_w0.w; |
|
|
|
out_h1_w0_c1 += weight_ptr[7] * in_h1_w0.w; |
|
|
|
|
|
|
|
weight_ptr += 8; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
|
|
|
|
int y_idx = h; |
|
|
|
__global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2; |
|
|
|
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { |
|
|
|
FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(w + 0, y_idx)); |
|
|
|
FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(w + 1, y_idx)); |
|
|
|
y_idx += 36; |
|
|
|
|
|
|
|
FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1]; |
|
|
|
weight_ptr += 2; |
|
|
|
|
|
|
|
out00 += in0.x * weight0.s0123; |
|
|
|
out00 += in0.y * weight0.s4567; |
|
|
|
out00 += in0.z * weight0.s89ab; |
|
|
|
out00 += in0.w * weight0.scdef; |
|
|
|
|
|
|
|
out01 += in1.x * weight0.s0123; |
|
|
|
out01 += in1.y * weight0.s4567; |
|
|
|
out01 += in1.z * weight0.s89ab; |
|
|
|
out01 += in1.w * weight0.scdef; |
|
|
|
|
|
|
|
out10 += in0.x * weight1.s0123; |
|
|
|
out10 += in0.y * weight1.s4567; |
|
|
|
out10 += in0.z * weight1.s89ab; |
|
|
|
out10 += in0.w * weight1.scdef; |
|
|
|
|
|
|
|
out11 += in1.x * weight1.s0123; |
|
|
|
out11 += in1.y * weight1.s4567; |
|
|
|
out11 += in1.z * weight1.s89ab; |
|
|
|
out11 += in1.w * weight1.scdef; |
|
|
|
if (bias) { |
|
|
|
out_h0_w0_c0 += bias[co_slice0]; |
|
|
|
out_h1_w0_c0 += bias[co_slice0]; |
|
|
|
out_h0_w0_c1 += bias[co_slice1]; |
|
|
|
out_h1_w0_c1 += bias[co_slice1]; |
|
|
|
} |
|
|
|
|
|
|
|
WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00); |
|
|
|
if (w + 1 < W) { |
|
|
|
WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01); |
|
|
|
if (act_type == ActType_Relu) { |
|
|
|
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); |
|
|
|
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); |
|
|
|
out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f)); |
|
|
|
out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); |
|
|
|
} else if (act_type == ActType_Relu6) { |
|
|
|
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
} |
|
|
|
|
|
|
|
if (co_slice + 1 < CO_SLICES) { |
|
|
|
WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10); |
|
|
|
if (w + 1 < W) { |
|
|
|
WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11); |
|
|
|
} |
|
|
|
if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { |
|
|
|
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); |
|
|
|
if (oh1 < OH) { |
|
|
|
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); |
|
|
|
} // end if (oh1 < OH) |
|
|
|
if (co_slice1 < CO_SLICES) { |
|
|
|
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); |
|
|
|
if (oh1 < OH) { |
|
|
|
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); |
|
|
|
} // end if (oh1 < OH) |
|
|
|
} // end if (co_slice1 < CO_SLICES) |
|
|
|
} else { |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); |
|
|
|
if (oh1 < OH) { |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0); |
|
|
|
} // end (oh1 < OH) |
|
|
|
if (co_slice1 < CO_SLICES) { |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow0), out_h0_w0_c1); |
|
|
|
if (oh1 < OH) { |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow0), out_h1_w0_c1); |
|
|
|
} // end if (oh1 < OH) |
|
|
|
} // end if (co_slice1 < CO_SLICES) |
|
|
|
} |
|
|
|
#undef H |
|
|
|
} |
|
|
|
|
|
|
|
// 4x6 coefficient table stored row-major (element (r, c) is At[r * 6 + c]).
// Winograd36To4x4 applies it twice to each 6x6 tile of 36 values: one row
// (At + row * 6, row in [0, 4)) reduces the tile to 6 values, which are then
// combined with each of the 4 rows in turn to produce 4 output pixels.
// NOTE(review): presumably the A^T output-transform matrix of a Winograd
// F(4x4, 3x3) scheme -- the values are precomputed constants whose derivation
// is not shown here, so do not edit them by hand.
constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f, |
|
|
|
0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f, |
|
|
|
0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f, |
|
|
|
0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; |
|
|
|
|
|
|
|
__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias, |
|
|
|
const int4 input_shape, // N 36 H/4*W/4 CO_SLICES |
|
|
|
const int4 output_shape, // N H W CO_SLICES |
|
|
|
const int act_type) { |
|
|
|
int tile_xy = get_global_id(0); |
|
|
|
int row = get_global_id(1); |
|
|
|
int slice = get_global_id(2); |
|
|
|
|
|
|
|
int TILE_XY = input_shape.z; |
|
|
|
int SLICES = input_shape.w; |
|
|
|
int OW = output_shape.z; |
|
|
|
|
|
|
|
if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES) { |
|
|
|
return; |
|
|
|
} |
|
|
|
__kernel void Convolution_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, |
|
|
|
__global FLT4 *bias, const int4 input_shape, const int4 output_shape, |
|
|
|
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { |
|
|
|
const int BlockH = 2; |
|
|
|
const int BlockW = 2; |
|
|
|
const int BlockC = 2; |
|
|
|
DEFINE_ARGS; |
|
|
|
|
|
|
|
const int oh0 = oh + 0; |
|
|
|
const int oh1 = oh + 1; |
|
|
|
const int n_oh0 = n * OH + oh0; |
|
|
|
const int n_oh1 = n * OH + oh1; |
|
|
|
const int ow0 = ow + 0; |
|
|
|
const int ow1 = ow + 1; |
|
|
|
const int co_slice0 = co_slice + 0; |
|
|
|
const int co_slice1 = co_slice + 1; |
|
|
|
|
|
|
|
FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out_h0_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
FLT4 out_h1_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
|
|
|
|
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; |
|
|
|
|
|
|
|
constant FLT *At_row = At + row * 6; |
|
|
|
FLT4 AtM_row[6] = {0}; |
|
|
|
for (int y = 0, idx = slice * 36; y < 6; y++) { |
|
|
|
for (int x = 0; x < 6; x++, idx++) { |
|
|
|
AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, idx)); |
|
|
|
} |
|
|
|
} |
|
|
|
for (int kh = 0; kh < KH; ++kh) { |
|
|
|
const int ih0 = kh * dilationH + oh0 * strideH - padTop; |
|
|
|
// No need to bounds-check oh1 here: the final write-out is guarded by (oh1 < OH). |
|
|
|
const int ih1 = kh * dilationH + oh1 * strideH - padTop; |
|
|
|
// check ih0 and ih1 |
|
|
|
const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; |
|
|
|
const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; |
|
|
|
|
|
|
|
int TILE_X = UP_DIV(OW, 4); |
|
|
|
int tile_x = tile_xy % TILE_X; |
|
|
|
int tile_y = tile_xy / TILE_X; |
|
|
|
int oh = tile_y * 4 + row; |
|
|
|
int ow = tile_x * 4; |
|
|
|
int x_idx = ow * SLICES + slice; |
|
|
|
|
|
|
|
for (int x = 0, idx = 0; x < 4; x++) { |
|
|
|
FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); |
|
|
|
for (int y = 0; y < 6; y++, idx++) { |
|
|
|
acc += AtM_row[y] * At[idx]; |
|
|
|
for (int kw = 0; kw < KW; ++kw) { |
|
|
|
const int iw0 = kw * dilationW + ow0 * strideW - padLeft; |
|
|
|
int iw1 = (ow1 < OW) ? kw * dilationW + ow1 * strideW - padLeft : -2; |
|
|
|
int x_idx0 = iw0 * CI_SLICES; |
|
|
|
int x_idx1 = iw1 * CI_SLICES; |
|
|
|
|
|
|
|
for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { |
|
|
|
FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); |
|
|
|
FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0)); |
|
|
|
FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1)); |
|
|
|
FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1)); |
|
|
|
x_idx0++; |
|
|
|
x_idx1++; |
|
|
|
|
|
|
|
out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x; |
|
|
|
out_h0_w1_c0 += weight_ptr[0] * in_h0_w1.x; |
|
|
|
out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x; |
|
|
|
out_h1_w1_c0 += weight_ptr[0] * in_h1_w1.x; |
|
|
|
out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y; |
|
|
|
out_h0_w1_c0 += weight_ptr[1] * in_h0_w1.y; |
|
|
|
out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y; |
|
|
|
out_h1_w1_c0 += weight_ptr[1] * in_h1_w1.y; |
|
|
|
out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z; |
|
|
|
out_h0_w1_c0 += weight_ptr[2] * in_h0_w1.z; |
|
|
|
out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z; |
|
|
|
out_h1_w1_c0 += weight_ptr[2] * in_h1_w1.z; |
|
|
|
out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w; |
|
|
|
out_h0_w1_c0 += weight_ptr[3] * in_h0_w1.w; |
|
|
|
out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w; |
|
|
|
out_h1_w1_c0 += weight_ptr[3] * in_h1_w1.w; |
|
|
|
|
|
|
|
out_h0_w0_c1 += weight_ptr[4] * in_h0_w0.x; |
|
|
|
out_h0_w1_c1 += weight_ptr[4] * in_h0_w1.x; |
|
|
|
out_h1_w0_c1 += weight_ptr[4] * in_h1_w0.x; |
|
|
|
out_h1_w1_c1 += weight_ptr[4] * in_h1_w1.x; |
|
|
|
out_h0_w0_c1 += weight_ptr[5] * in_h0_w0.y; |
|
|
|
out_h0_w1_c1 += weight_ptr[5] * in_h0_w1.y; |
|
|
|
out_h1_w0_c1 += weight_ptr[5] * in_h1_w0.y; |
|
|
|
out_h1_w1_c1 += weight_ptr[5] * in_h1_w1.y; |
|
|
|
out_h0_w0_c1 += weight_ptr[6] * in_h0_w0.z; |
|
|
|
out_h0_w1_c1 += weight_ptr[6] * in_h0_w1.z; |
|
|
|
out_h1_w0_c1 += weight_ptr[6] * in_h1_w0.z; |
|
|
|
out_h1_w1_c1 += weight_ptr[6] * in_h1_w1.z; |
|
|
|
out_h0_w0_c1 += weight_ptr[7] * in_h0_w0.w; |
|
|
|
out_h0_w1_c1 += weight_ptr[7] * in_h0_w1.w; |
|
|
|
out_h1_w0_c1 += weight_ptr[7] * in_h1_w0.w; |
|
|
|
out_h1_w1_c1 += weight_ptr[7] * in_h1_w1.w; |
|
|
|
|
|
|
|
weight_ptr += 8; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if (bias != 0) { |
|
|
|
acc += bias[slice]; |
|
|
|
} |
|
|
|
if (bias) { |
|
|
|
out_h0_w0_c0 += bias[co_slice0]; |
|
|
|
out_h0_w1_c0 += bias[co_slice0]; |
|
|
|
out_h1_w0_c0 += bias[co_slice0]; |
|
|
|
out_h1_w1_c0 += bias[co_slice0]; |
|
|
|
out_h0_w0_c1 += bias[co_slice1]; |
|
|
|
out_h0_w1_c1 += bias[co_slice1]; |
|
|
|
out_h1_w0_c1 += bias[co_slice1]; |
|
|
|
out_h1_w1_c1 += bias[co_slice1]; |
|
|
|
} |
|
|
|
|
|
|
|
if (act_type == ActType_Relu) { |
|
|
|
acc = max(acc, (FLT4)(0.0f)); |
|
|
|
} else if (act_type == ActType_Relu6) { |
|
|
|
acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
} |
|
|
|
if (act_type == ActType_Relu) { |
|
|
|
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); |
|
|
|
out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f)); |
|
|
|
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); |
|
|
|
out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f)); |
|
|
|
out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f)); |
|
|
|
out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f)); |
|
|
|
out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); |
|
|
|
out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f)); |
|
|
|
} else if (act_type == ActType_Relu6) { |
|
|
|
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); |
|
|
|
} |
|
|
|
|
|
|
|
WRITE_IMAGE(output, (int2)(x_idx, oh), acc); |
|
|
|
x_idx += SLICES; |
|
|
|
if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { |
|
|
|
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); |
|
|
|
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0); |
|
|
|
if (oh1 < OH) { |
|
|
|
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); |
|
|
|
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0); |
|
|
|
} // end if (oh1 < OH) |
|
|
|
if (co_slice1 < CO_SLICES) { |
|
|
|
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); |
|
|
|
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1); |
|
|
|
if (oh1 < OH) { |
|
|
|
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); |
|
|
|
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1); |
|
|
|
} // end if (oh1 < OH) |
|
|
|
} // end if (co_slice1 < CO_SLICES) |
|
|
|
} else { |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow1), out_h0_w1_c0); |
|
|
|
if (oh1 < OH) { |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0); |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow1), out_h1_w1_c0); |
|
|
|
} // end (oh1 < OH) |
|
|
|
if (co_slice1 < CO_SLICES) { |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow0), out_h0_w0_c1); |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow1), out_h0_w1_c1); |
|
|
|
if (oh1 < OH) { |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow0), out_h1_w0_c1); |
|
|
|
WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow1), out_h1_w1_c1); |
|
|
|
} // end if (oh1 < OH) |
|
|
|
} // end if (co_slice1 < CO_SLICES) |
|
|
|
} |
|
|
|
} |