Merge pull request !8153 from wangdongxu6/opencl_conv_optimize_performance
@@ -3,268 +3,420 @@
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 #define CI_TILE 4
-#define CO_TILE 4
 #define MAX_IMAGE2D_SIZE 65535
 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
-#define ActType_No 0
 #define ActType_Relu 1
-#define ActType_Sigmod 2
 #define ActType_Relu6 3
-__kernel void Convolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
-                          __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
-                          const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
-  const int N = input_shape.x;
-  const int IH = input_shape.y;
-  const int IW = input_shape.z;
-  const int CI_SLICES = input_shape.w;
-  const int OH = output_shape.y;
-  const int OW = output_shape.z;
-  const int CO_SLICES = output_shape.w;
-  const int KH = kernel_stride.x;
-  const int KW = kernel_stride.y;
-  const int strideH = kernel_stride.z;
-  const int strideW = kernel_stride.w;
-  const int padTop = pad.x;
-  const int padLeft = pad.z;
-  const int dilationH = dilation.x;
-  const int dilationW = dilation.y;
-  int n_oh = get_global_id(0);      // [0, N*OH)
-  int ow = get_global_id(1);        // [0, OW)
-  int co_slice = get_global_id(2);  // [0, UP_DIV(CO, CO_TILE) )
-  int n;
-  int oh;
-  if (N == 1) {
-    n = 0;
-    oh = n_oh;
-  } else {
-    n = n_oh / OH;
-    oh = n_oh % OH;
-  }
-  if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) {
-    return;
-  }
-  FLT4 out_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  __global FLT4 *w_ic1_oc4 = weight + co_slice * KH * KW * CI_SLICES * CI_TILE;
-  for (int kh = 0; kh < KH; ++kh) {
-    int ih = kh * dilationH + oh * strideH - padTop;
-    for (int kw = 0; kw < KW; ++kw) {
-      int iw = kw * dilationW + ow * strideW - padLeft;
-      if (ih >= 0 && ih < IH && iw >= 0 && iw < IW) {
-        for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
-          FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, n * IH + ih));
-          out_c4 += w_ic1_oc4[0] * in_c4.x;
-          out_c4 += w_ic1_oc4[1] * in_c4.y;
-          out_c4 += w_ic1_oc4[2] * in_c4.z;
-          out_c4 += w_ic1_oc4[3] * in_c4.w;
-          w_ic1_oc4 += 4;
-        }
-      } else {
-        w_ic1_oc4 += 4 * CI_SLICES;
-      }
-    }
-  }
-  if (bias != 0) {
-    out_c4 = out_c4 + bias[co_slice];
-  }
-  // activation
-  if (act_type == ActType_Relu) {
-    out_c4 = max(out_c4, (FLT4)(0.0f));
-  } else if (act_type == ActType_Relu6) {
-    out_c4 = clamp(out_c4, (FLT4)(0.0f), (FLT4)(6.0f));
-  }
-  if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) {
-    WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, n_oh), out_c4);
-  } else {
-    WRITE_IMAGE(output, (int2)(n_oh * CO_SLICES + co_slice, ow), out_c4);
-  }
-}
+#define DEFINE_ARGS                                                                \
+  const int N = input_shape.x;                                                     \
+  const int IH = input_shape.y, IW = input_shape.z, CI_SLICES = input_shape.w;     \
+  const int OH = output_shape.y, OW = output_shape.z, CO_SLICES = output_shape.w;  \
+  const int KH = kernel_stride.x, KW = kernel_stride.y;                            \
+  const int strideH = kernel_stride.z, strideW = kernel_stride.w;                  \
+  const int padTop = pad.x, padBottom = pad.y, padLeft = pad.z, padRight = pad.w;  \
+  const int dilationH = dilation.x, dilationW = dilation.y;                        \
+                                                                                   \
+  const int n_oh = get_global_id(0);                                               \
+  const int ow = get_global_id(1) * BlockW;                                        \
+  const int co_slice = get_global_id(2) * BlockC;                                  \
+  const int OH_SLICES = UP_DIV(OH, BlockH);                                        \
+  const int n = n_oh / OH_SLICES;                                                  \
+  const int oh = (n_oh % OH_SLICES) * BlockH;                                      \
+  if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) {                   \
+    return;                                                                        \
+  }
+
+__kernel void Convolution_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                                 __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
+                                 const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
+  const int BlockH = 1;
+  const int BlockW = 1;
+  const int BlockC = 1;
+  DEFINE_ARGS;
+  const int oh0 = oh + 0;
+  const int n_oh0 = n * OH + oh0;
+  const int ow0 = ow + 0;
+  const int co_slice0 = co_slice + 0;
+  FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
+  for (int kh = 0; kh < KH; ++kh) {
+    const int ih0 = kh * dilationH + oh0 * strideH - padTop;
+    const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
+    for (int kw = 0; kw < KW; ++kw) {
+      const int iw0 = kw * dilationW + ow0 * strideW - padLeft;
+      int x_idx0 = iw0 * CI_SLICES;
+      for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+        FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
+        x_idx0++;
+        out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
+        out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
+        out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
+        out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
+        weight_ptr += 4;
+      }
+    }
+  }
+  if (bias) {
+    out_h0_w0_c0 += bias[co_slice0];
+  }
+  if (act_type == ActType_Relu) {
+    out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
+  } else if (act_type == ActType_Relu6) {
+    out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+  }
+  if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+  } else {
+    WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0);
+  }
+}
-constant FLT Bt[36] = {
-  1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,
-  0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,
-  0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f,
-  0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f,
-  0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f,
-  0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f,
-};
-__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output,
-                              const int4 input_shape,     // N H W CI_SLICES
-                              const int4 output_shape) {  // N 36 H/4*W/4 CI_SLICES
-#define PAD 1
-  int tile_xy = get_global_id(0);
-  int row = get_global_id(1);
-  int slice = get_global_id(2);
-  int TILE_XY = output_shape.z;
-  int SLICES = input_shape.w;
-  if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES) {
-    return;
-  }
-  int IW = input_shape.z;
-  int TILE_X = UP_DIV(IW, 4);
-  int tile_x = tile_xy % TILE_X;
-  int tile_y = tile_xy / TILE_X;
-  constant FLT *Bt_row = Bt + row * 6;
-  FLT4 BtD_row[6] = {0};
-  int ih = tile_y * 4 - PAD;
-  int iw = tile_x * 4 - PAD;
-  for (int y = 0; y < 6; y++) {
-    int x_idx = iw * SLICES + slice;
-    for (int x = 0; x < 6; x++) {
-      // no need to check iw: because slice is in [0, SLICES). when iw<0, x_idx<0; iw>=IW, x_idx>=IW*SLICES
-      // if (iw < 0 || iw >= IW) { continue; }
-      BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, ih));
-      x_idx += SLICES;
-    }
-    ih++;
-  }
-  int y_idx = slice * 36 + row * 6;
-  for (int y = 0; y < 6; y++) {
-    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-    for (int x = 0; x < 6; x++) {
-      acc += BtD_row[x] * Bt[y * 6 + x];
-    }
-    WRITE_IMAGE(output, (int2)(tile_xy, y_idx + y), acc);  // CH W H=36
-  }
-#undef PAD
-}
+__kernel void Convolution_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                                 __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
+                                 const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
+  const int BlockH = 2;
+  const int BlockW = 1;
+  const int BlockC = 1;
+  DEFINE_ARGS;
+  const int oh0 = oh + 0;
+  const int oh1 = oh + 1;
+  const int n_oh0 = n * OH + oh0;
+  const int n_oh1 = n * OH + oh1;
+  const int ow0 = ow + 0;
+  const int co_slice0 = co_slice + 0;
+  FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
+  for (int kh = 0; kh < KH; ++kh) {
+    const int ih0 = kh * dilationH + oh0 * strideH - padTop;
+    // no need to check oh1, finally write out will check (oh1 < OH)
+    const int ih1 = kh * dilationH + oh1 * strideH - padTop;
+    // check ih0 and ih1
+    const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
+    const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
+    for (int kw = 0; kw < KW; ++kw) {
+      const int iw0 = kw * dilationW + ow0 * strideW - padLeft;
+      int x_idx0 = iw0 * CI_SLICES;
+      for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+        FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
+        FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
+        x_idx0++;
+        out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
+        out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
+        out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
+        out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
+        out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
+        out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
+        out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
+        out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
+        weight_ptr += 4;
+      }
+    }
+  }
+  if (bias) {
+    out_h0_w0_c0 += bias[co_slice0];
+    out_h1_w0_c0 += bias[co_slice0];
+  }
+  if (act_type == ActType_Relu) {
+    out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
+    out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
+  } else if (act_type == ActType_Relu6) {
+    out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+  }
+  if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+    }  // end if (oh1 < OH)
+  } else {
+    WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0);
+    }  // end (oh1 < OH)
+  }
+}
-__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
-                                  const int4 input_shape,     // N 36 H/4*W/4 CI_SLICES
-                                  const int4 output_shape) {  // N 36 H/4*W/4 CO_SLICES
-#define H 36
-  int w = get_global_id(0) * 2;
-  int h = get_global_id(1);
-  int co_slice = get_global_id(2) * 2;
-  int CI_SLICES = input_shape.w;
-  int W = input_shape.z;
-  int CO_SLICES = output_shape.w;
-  if (h >= H || w >= W || co_slice >= CO_SLICES) {
-    return;
-  }
-  FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-  int y_idx = h;
-  __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;
-  for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
-    FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(w + 0, y_idx));
-    FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(w + 1, y_idx));
-    y_idx += 36;
-    FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];
-    weight_ptr += 2;
-    out00 += in0.x * weight0.s0123;
-    out00 += in0.y * weight0.s4567;
-    out00 += in0.z * weight0.s89ab;
-    out00 += in0.w * weight0.scdef;
-    out01 += in1.x * weight0.s0123;
-    out01 += in1.y * weight0.s4567;
-    out01 += in1.z * weight0.s89ab;
-    out01 += in1.w * weight0.scdef;
-    out10 += in0.x * weight1.s0123;
-    out10 += in0.y * weight1.s4567;
-    out10 += in0.z * weight1.s89ab;
-    out10 += in0.w * weight1.scdef;
-    out11 += in1.x * weight1.s0123;
-    out11 += in1.y * weight1.s4567;
-    out11 += in1.z * weight1.s89ab;
-    out11 += in1.w * weight1.scdef;
-  }
-  WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);
-  if (w + 1 < W) {
-    WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);
-  }
-  if (co_slice + 1 < CO_SLICES) {
-    WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);
-    if (w + 1 < W) {
-      WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);
-    }
-  }
-#undef H
-}
+__kernel void Convolution_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                                 __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
+                                 const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
+  const int BlockH = 2;
+  const int BlockW = 1;
+  const int BlockC = 2;
+  DEFINE_ARGS;
+  const int oh0 = oh + 0;
+  const int oh1 = oh + 1;
+  const int n_oh0 = n * OH + oh0;
+  const int n_oh1 = n * OH + oh1;
+  const int ow0 = ow + 0;
+  const int co_slice0 = co_slice + 0;
+  const int co_slice1 = co_slice + 1;
+  FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
+  for (int kh = 0; kh < KH; ++kh) {
+    const int ih0 = kh * dilationH + oh0 * strideH - padTop;
+    // no need to check oh1, finally write out will check (oh1 < OH)
+    const int ih1 = kh * dilationH + oh1 * strideH - padTop;
+    // check ih0 and ih1
+    const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
+    const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
+    for (int kw = 0; kw < KW; ++kw) {
+      const int iw0 = kw * dilationW + ow0 * strideW - padLeft;
+      int x_idx0 = iw0 * CI_SLICES;
+      for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+        FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
+        FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
+        x_idx0++;
+        out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
+        out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
+        out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
+        out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
+        out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
+        out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
+        out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
+        out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
+        out_h0_w0_c1 += weight_ptr[4] * in_h0_w0.x;
+        out_h1_w0_c1 += weight_ptr[4] * in_h1_w0.x;
+        out_h0_w0_c1 += weight_ptr[5] * in_h0_w0.y;
+        out_h1_w0_c1 += weight_ptr[5] * in_h1_w0.y;
+        out_h0_w0_c1 += weight_ptr[6] * in_h0_w0.z;
+        out_h1_w0_c1 += weight_ptr[6] * in_h1_w0.z;
+        out_h0_w0_c1 += weight_ptr[7] * in_h0_w0.w;
+        out_h1_w0_c1 += weight_ptr[7] * in_h1_w0.w;
+        weight_ptr += 8;
+      }
+    }
+  }
+  if (bias) {
+    out_h0_w0_c0 += bias[co_slice0];
+    out_h1_w0_c0 += bias[co_slice0];
+    out_h0_w0_c1 += bias[co_slice1];
+    out_h1_w0_c1 += bias[co_slice1];
+  }
+  if (act_type == ActType_Relu) {
+    out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
+    out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
+    out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f));
+    out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f));
+  } else if (act_type == ActType_Relu6) {
+    out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+  }
+  if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+    }  // end if (oh1 < OH)
+    if (co_slice1 < CO_SLICES) {
+      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
+      if (oh1 < OH) {
+        WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
+      }  // end if (oh1 < OH)
+    }  // end if (co_slice1 < CO_SLICES)
+  } else {
+    WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0);
+    }  // end (oh1 < OH)
+    if (co_slice1 < CO_SLICES) {
+      WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow0), out_h0_w0_c1);
+      if (oh1 < OH) {
+        WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow0), out_h1_w0_c1);
+      }  // end if (oh1 < OH)
+    }  // end if (co_slice1 < CO_SLICES)
+  }
+}
-constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,
-                       0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,
-                       0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,
-                       0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f};
-__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias,
-                              const int4 input_shape,   // N 36 H/4*W/4 CO_SLICES
-                              const int4 output_shape,  // N H W CO_SLICES
-                              const int act_type) {
-  int tile_xy = get_global_id(0);
-  int row = get_global_id(1);
-  int slice = get_global_id(2);
-  int TILE_XY = input_shape.z;
-  int SLICES = input_shape.w;
-  int OW = output_shape.z;
-  if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES) {
-    return;
-  }
-  constant FLT *At_row = At + row * 6;
-  FLT4 AtM_row[6] = {0};
-  for (int y = 0, idx = slice * 36; y < 6; y++) {
-    for (int x = 0; x < 6; x++, idx++) {
-      AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, idx));
-    }
-  }
-  int TILE_X = UP_DIV(OW, 4);
-  int tile_x = tile_xy % TILE_X;
-  int tile_y = tile_xy / TILE_X;
-  int oh = tile_y * 4 + row;
-  int ow = tile_x * 4;
-  int x_idx = ow * SLICES + slice;
-  for (int x = 0, idx = 0; x < 4; x++) {
-    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
-    for (int y = 0; y < 6; y++, idx++) {
-      acc += AtM_row[y] * At[idx];
-    }
-    if (bias != 0) {
-      acc += bias[slice];
-    }
-    if (act_type == ActType_Relu) {
-      acc = max(acc, (FLT4)(0.0f));
-    } else if (act_type == ActType_Relu6) {
-      acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));
-    }
-    WRITE_IMAGE(output, (int2)(x_idx, oh), acc);
-    x_idx += SLICES;
-  }
-}
+__kernel void Convolution_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
+                                 __global FLT4 *bias, const int4 input_shape, const int4 output_shape,
+                                 const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
+  const int BlockH = 2;
+  const int BlockW = 2;
+  const int BlockC = 2;
+  DEFINE_ARGS;
+  const int oh0 = oh + 0;
+  const int oh1 = oh + 1;
+  const int n_oh0 = n * OH + oh0;
+  const int n_oh1 = n * OH + oh1;
+  const int ow0 = ow + 0;
+  const int ow1 = ow + 1;
+  const int co_slice0 = co_slice + 0;
+  const int co_slice1 = co_slice + 1;
+  FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h0_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out_h1_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;
+  for (int kh = 0; kh < KH; ++kh) {
+    const int ih0 = kh * dilationH + oh0 * strideH - padTop;
+    // no need to check oh1, finally write out will check (oh1 < OH)
+    const int ih1 = kh * dilationH + oh1 * strideH - padTop;
+    // check ih0 and ih1
+    const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
+    const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
+    for (int kw = 0; kw < KW; ++kw) {
+      const int iw0 = kw * dilationW + ow0 * strideW - padLeft;
+      int iw1 = (ow1 < OW) ? kw * dilationW + ow1 * strideW - padLeft : -2;
+      int x_idx0 = iw0 * CI_SLICES;
+      int x_idx1 = iw1 * CI_SLICES;
+      for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+        FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0));
+        FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0));
+        FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1));
+        FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1));
+        x_idx0++;
+        x_idx1++;
+        out_h0_w0_c0 += weight_ptr[0] * in_h0_w0.x;
+        out_h0_w1_c0 += weight_ptr[0] * in_h0_w1.x;
+        out_h1_w0_c0 += weight_ptr[0] * in_h1_w0.x;
+        out_h1_w1_c0 += weight_ptr[0] * in_h1_w1.x;
+        out_h0_w0_c0 += weight_ptr[1] * in_h0_w0.y;
+        out_h0_w1_c0 += weight_ptr[1] * in_h0_w1.y;
+        out_h1_w0_c0 += weight_ptr[1] * in_h1_w0.y;
+        out_h1_w1_c0 += weight_ptr[1] * in_h1_w1.y;
+        out_h0_w0_c0 += weight_ptr[2] * in_h0_w0.z;
+        out_h0_w1_c0 += weight_ptr[2] * in_h0_w1.z;
+        out_h1_w0_c0 += weight_ptr[2] * in_h1_w0.z;
+        out_h1_w1_c0 += weight_ptr[2] * in_h1_w1.z;
+        out_h0_w0_c0 += weight_ptr[3] * in_h0_w0.w;
+        out_h0_w1_c0 += weight_ptr[3] * in_h0_w1.w;
+        out_h1_w0_c0 += weight_ptr[3] * in_h1_w0.w;
+        out_h1_w1_c0 += weight_ptr[3] * in_h1_w1.w;
+        out_h0_w0_c1 += weight_ptr[4] * in_h0_w0.x;
+        out_h0_w1_c1 += weight_ptr[4] * in_h0_w1.x;
+        out_h1_w0_c1 += weight_ptr[4] * in_h1_w0.x;
+        out_h1_w1_c1 += weight_ptr[4] * in_h1_w1.x;
+        out_h0_w0_c1 += weight_ptr[5] * in_h0_w0.y;
+        out_h0_w1_c1 += weight_ptr[5] * in_h0_w1.y;
+        out_h1_w0_c1 += weight_ptr[5] * in_h1_w0.y;
+        out_h1_w1_c1 += weight_ptr[5] * in_h1_w1.y;
+        out_h0_w0_c1 += weight_ptr[6] * in_h0_w0.z;
+        out_h0_w1_c1 += weight_ptr[6] * in_h0_w1.z;
+        out_h1_w0_c1 += weight_ptr[6] * in_h1_w0.z;
+        out_h1_w1_c1 += weight_ptr[6] * in_h1_w1.z;
+        out_h0_w0_c1 += weight_ptr[7] * in_h0_w0.w;
+        out_h0_w1_c1 += weight_ptr[7] * in_h0_w1.w;
+        out_h1_w0_c1 += weight_ptr[7] * in_h1_w0.w;
+        out_h1_w1_c1 += weight_ptr[7] * in_h1_w1.w;
+        weight_ptr += 8;
+      }
+    }
+  }
+  if (bias) {
+    out_h0_w0_c0 += bias[co_slice0];
+    out_h0_w1_c0 += bias[co_slice0];
+    out_h1_w0_c0 += bias[co_slice0];
+    out_h1_w1_c0 += bias[co_slice0];
+    out_h0_w0_c1 += bias[co_slice1];
+    out_h0_w1_c1 += bias[co_slice1];
+    out_h1_w0_c1 += bias[co_slice1];
+    out_h1_w1_c1 += bias[co_slice1];
+  }
+  if (act_type == ActType_Relu) {
+    out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
+    out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f));
+    out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
+    out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f));
+    out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f));
+    out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f));
+    out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f));
+    out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f));
+  } else if (act_type == ActType_Relu6) {
+    out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+    out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
+  }
+  if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) {
+    WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
+    WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
+      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
+    }  // end if (oh1 < OH)
+    if (co_slice1 < CO_SLICES) {
+      WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
+      WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
+      if (oh1 < OH) {
+        WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
+        WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
+      }  // end if (oh1 < OH)
+    }  // end if (co_slice1 < CO_SLICES)
+  } else {
+    WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0);
+    WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow1), out_h0_w1_c0);
+    if (oh1 < OH) {
+      WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0);
+      WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow1), out_h1_w1_c0);
+    }  // end (oh1 < OH)
+    if (co_slice1 < CO_SLICES) {
+      WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow0), out_h0_w0_c1);
+      WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow1), out_h0_w1_c1);
+      if (oh1 < OH) {
+        WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow0), out_h1_w0_c1);
+        WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow1), out_h1_w1_c1);
+      }  // end if (oh1 < OH)
+    }  // end if (co_slice1 < CO_SLICES)
+  }
+}
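Note: each Convolution_H{h}W{w}C{c} variant above computes BlockH output rows, BlockW output columns and BlockC channel slices per work-item, so the NDRange shrinks by the same factors. Out-of-range rows are read through y_idx = -1: with the CLK_ADDRESS_CLAMP sampler declared at the top of the file, such reads return the border color (zero for the RGBA image layout used here), which implements zero padding without branches in the inner loop. A minimal host-side C++ sketch of the index mapping that DEFINE_ARGS inverts (the helper name is hypothetical; it mirrors SetGlobalLocal() in the C++ change further down):

  constexpr int UpDiv(int x, int y) { return (x + y - 1) / y; }
  struct Coord { int n, oh, ow, co_slice; };
  // gid0..gid2 are one work-item's global ids
  Coord MapGlobalId(int gid0, int gid1, int gid2, int OH, int BlockH, int BlockW, int BlockC) {
    const int oh_slices = UpDiv(OH, BlockH);  // row-blocks per batch image
    return {gid0 / oh_slices,                 // n
            (gid0 % oh_slices) * BlockH,      // oh: first row of this block
            gid1 * BlockW,                    // ow: first column of this block
            gid2 * BlockC};                   // co_slice: first channel slice
  }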
@@ -0,0 +1,187 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
+#define ActType_Relu 1
+#define ActType_Relu6 3
+
+constant FLT Bt[36] = {
+  1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,
+  0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,
+  0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f,
+  0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f,
+  0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f,
+  0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f,
+};
+
+__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output,
+                              const int4 input_shape,     // N H W CI_SLICES
+                              const int4 output_shape) {  // N 36 H/4*W/4 CI_SLICES
+#define PAD 1
+  int tile_xy = get_global_id(0);
+  int row = get_global_id(1);
+  int slice = get_global_id(2);
+  int TILE_XY = output_shape.z;
+  int SLICES = input_shape.w;
+  if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES) {
+    return;
+  }
+  int IH = input_shape.y, IW = input_shape.z;
+  int TILE_X = UP_DIV(IW, 4);
+  int tile_x = tile_xy % TILE_X;
+  int tile_y = tile_xy / TILE_X;
+  constant FLT *Bt_row = Bt + row * 6;
+  FLT4 BtD_row[6] = {0};
+  int ih = tile_y * 4 - PAD;
+  int iw = tile_x * 4 - PAD;
+  for (int y = 0; y < 6; y++) {
+    int x_idx = iw * SLICES + slice;
+    for (int x = 0; x < 6; x++) {
+      // no need to check iw: because slice is in [0, SLICES). when iw<0, x_idx<0; iw>=IW, x_idx>=IW*SLICES
+      // if (iw < 0 || iw >= IW) { continue; }
+      BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, ih));
+      x_idx += SLICES;
+    }
+    ih++;
+  }
+  int y_idx = slice * 36 + row * 6;
+  for (int y = 0; y < 6; y++) {
+    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+    for (int x = 0; x < 6; x++) {
+      acc += BtD_row[x] * Bt[y * 6 + x];
+    }
+    WRITE_IMAGE(output, (int2)(tile_xy, y_idx + y), acc);  // CH W H=36
+  }
+#undef PAD
+}
+
+__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
+                                  const int4 input_shape,     // N 36 H/4*W/4 CI_SLICES
+                                  const int4 output_shape) {  // N 36 H/4*W/4 CO_SLICES
+#define H 36
+  int w = get_global_id(0) * 2;
+  int h = get_global_id(1);
+  int co_slice = get_global_id(2) * 2;
+  int CI_SLICES = input_shape.w;
+  int W = input_shape.z;
+  int CO_SLICES = output_shape.w;
+  if (h >= H || w >= W || co_slice >= CO_SLICES) {
+    return;
+  }
+  FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+  int y_idx = h;
+  __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;
+  for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
+    FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(w + 0, y_idx));
+    FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(w + 1, y_idx));
+    y_idx += 36;
+    FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];
+    weight_ptr += 2;
+    out00 += in0.x * weight0.s0123;
+    out00 += in0.y * weight0.s4567;
+    out00 += in0.z * weight0.s89ab;
+    out00 += in0.w * weight0.scdef;
+    out01 += in1.x * weight0.s0123;
+    out01 += in1.y * weight0.s4567;
+    out01 += in1.z * weight0.s89ab;
+    out01 += in1.w * weight0.scdef;
+    out10 += in0.x * weight1.s0123;
+    out10 += in0.y * weight1.s4567;
+    out10 += in0.z * weight1.s89ab;
+    out10 += in0.w * weight1.scdef;
+    out11 += in1.x * weight1.s0123;
+    out11 += in1.y * weight1.s4567;
+    out11 += in1.z * weight1.s89ab;
+    out11 += in1.w * weight1.scdef;
+  }
+  WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);
+  if (w + 1 < W) {
+    WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);
+  }
+  if (co_slice + 1 < CO_SLICES) {
+    WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);
+    if (w + 1 < W) {
+      WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);
+    }
+  }
+#undef H
+}
+
+constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,
+                       0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,
+                       0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,
+                       0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f};
+
+__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias,
+                              const int4 input_shape,   // N 36 H/4*W/4 CO_SLICES
+                              const int4 output_shape,  // N H W CO_SLICES
+                              const int act_type) {
+  int tile_xy = get_global_id(0);
+  int row = get_global_id(1);
+  int slice = get_global_id(2);
+  int TILE_XY = input_shape.z;
+  int SLICES = input_shape.w;
+  int OH = output_shape.y;
+  int OW = output_shape.z;
+  if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES) {
+    return;
+  }
+  constant FLT *At_row = At + row * 6;
+  FLT4 AtM_row[6] = {0};
+  for (int y = 0, idx = slice * 36; y < 6; y++) {
+    for (int x = 0; x < 6; x++, idx++) {
+      AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, idx));
+    }
+  }
+  int TILE_X = UP_DIV(OW, 4);
+  int tile_x = tile_xy % TILE_X;
+  int tile_y = tile_xy / TILE_X;
+  int oh = tile_y * 4 + row;
+  int ow = tile_x * 4;
+  int x_idx = ow * SLICES + slice;
+  for (int x = 0, idx = 0; x < 4; x++) {
+    FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
+    for (int y = 0; y < 6; y++, idx++) {
+      acc += AtM_row[y] * At[idx];
+    }
+    if (bias) {
+      acc += bias[slice];
+    }
+    if (act_type == ActType_Relu) {
+      acc = max(acc, (FLT4)(0.0f));
+    } else if (act_type == ActType_Relu6) {
+      acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));
+    }
+    WRITE_IMAGE(output, (int2)(x_idx, oh), acc);
+    x_idx += SLICES;
+  }
+}
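Note: the three kernels above form a Winograd F(4x4, 3x3) pipeline; Bt and At hold the transposed input and output transform matrices B^T (6x6) and A^T (4x6) in floating point. Per 6x6 input tile d the computation is, schematically,

  Y = A^T [ U * (B^T d B) ] A

where U is the transformed filter (G g G^T for each 3x3 filter g), precomputed on the host — GenerateWinogradWeight() in the C++ header below. The `*` is elementwise per transformed position, but summed over input channels, which is why WinogradConvolution runs a small matrix multiply over CI_SLICES at each of the 36 positions. Winograd4x4To36 produces B^T d B, and Winograd36To4x4 applies the A^T ... A output transform plus bias and activation.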
@@ -24,6 +24,7 @@
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
 #include "src/runtime/kernel/opencl/cl/convolution.cl.inc"
+#include "src/runtime/kernel/opencl/cl/winograd.cl.inc"
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
@@ -38,14 +39,11 @@ constexpr size_t CI_TILE = C4NUM;
 constexpr size_t CO_TILE = C4NUM;
 int ConvolutionOpenCLKernel::Init() {
-  auto allocator = ocl_runtime_->GetAllocator();
-  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  std::set<std::string> build_options;
   use_fp16_ = ocl_runtime_->GetFp16Enable();
+  sizeof_FLT_ = use_fp16_ ? sizeof(float16_t) : sizeof(float);
   auto input_tensor = in_tensors_[0];
   auto output_tensor = out_tensors_[0];
   batch_size_ = input_tensor->Batch();
   CI_ = input_tensor->Channel();
   IH_ = input_tensor->Height();
@@ -55,8 +53,8 @@ int ConvolutionOpenCLKernel::Init() {
   OW_ = output_tensor->Width();
   CI_SLICES_ = UP_DIV(CI_, C4NUM);
   CO_SLICES_ = UP_DIV(CO_, C4NUM);
-  KH_ = param->kernel_h_;
-  KW_ = param->kernel_w_;
+  KH_ = param_->kernel_h_;
+  KW_ = param_->kernel_w_;
   has_bias_ = in_tensors_.size() == 3;
   // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true
@@ -65,28 +63,39 @@ int ConvolutionOpenCLKernel::Init() {
   TILES_XY_ = TILES_X_ * TILES_Y_;
   use_winograd_ = UseWinograd4x4To6x6();
+  if (!use_winograd_) {
+    SetBlockSize();
+    SetGlobalLocal();
+  }
   // build kernel
-  std::string program_name = "Convolution";
-  ocl_runtime_->LoadSource(program_name, convolution_source);
+  std::set<std::string> build_options;
   if (use_winograd_) {
     MS_LOG(DEBUG) << "use winograd";
+    std::string program_name = "Winograd";
+    ocl_runtime_->LoadSource(program_name, winograd_source);
     ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options);
     ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options);
     ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options);
   } else {
-    ocl_runtime_->BuildKernel(kernel_conv_, program_name, "Convolution", build_options);
+    std::string program_name = "Convolution";
+    std::string kernel_name = "Convolution_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) +
+                              "C" + std::to_string(block_size_.C);
+    ocl_runtime_->LoadSource("Convolution", convolution_source);
+    ocl_runtime_->BuildKernel(kernel_conv_, program_name, kernel_name, build_options);
   }
   // allocate winograd memory
   if (use_winograd_) {
+    auto allocator = ocl_runtime_->GetAllocator();
     size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
-    size_t size = TILES_XY_ * CI_SLICES_ * 36 * sizeof_FLT();
+    size_t size = TILES_XY_ * CI_SLICES_ * 36 * sizeof_FLT_;
     size_t width = TILES_XY_;
     size_t height = CI_SLICES_ * 36;
    winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype});
-    size = TILES_XY_ * CO_SLICES_ * 36 * sizeof_FLT();
+    size = TILES_XY_ * CO_SLICES_ * 36 * sizeof_FLT_;
     width = TILES_XY_;
     height = CO_SLICES_ * 36;
     winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype});
@@ -156,9 +165,9 @@ int ConvolutionOpenCLKernel::InitWeight() {
   // allocate memory
   size_t packed_weight_size;
   if (use_winograd_) {
-    packed_weight_size = UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE * sizeof_FLT();
+    packed_weight_size = UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE * sizeof_FLT_;
   } else {
-    packed_weight_size = CO_SLICES_ * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE * sizeof_FLT();
+    packed_weight_size = UP_ROUND(CO_SLICES_, block_size_.C) * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE * sizeof_FLT_;
   }
   packed_weight_ = allocator->Malloc(packed_weight_size);
   allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
@@ -171,15 +180,19 @@ int ConvolutionOpenCLKernel::InitWeight() {
   auto weight_tensor = in_tensors_[1];
   if (weight_tensor->data_type() == kNumberTypeFloat16) {
     if (use_fp16_) {
-      ConvertConvWeight4DTo7D<float16_t, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
+      ConvertConvWeight4DTo7D<float16_t, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_,
+                                                    block_size_.C);
     } else {
-      ConvertConvWeight4DTo7D<float16_t, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
+      ConvertConvWeight4DTo7D<float16_t, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_,
+                                                block_size_.C);
     }
   } else {
     if (use_fp16_) {
-      ConvertConvWeight4DTo7D<float, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
+      ConvertConvWeight4DTo7D<float, float16_t>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_,
+                                                block_size_.C);
    } else {
-      ConvertConvWeight4DTo7D<float, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_);
+      ConvertConvWeight4DTo7D<float, float>(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_,
+                                            block_size_.C);
     }
   }
 }
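Note: a worked example of the new non-winograd packed weight size (values chosen for illustration, not taken from the patch): CO_ = 32, CI_ = 32, 3x3 kernel, fp16, block_size_.C = 2. Then CI_SLICES_ = CO_SLICES_ = UP_DIV(32, 4) = 8 and, assuming UP_ROUND(x, y) rounds x up to a multiple of y, UP_ROUND(8, 2) = 8, so packed_weight_size = 8 * 3 * 3 * 8 * 4 * 4 * sizeof(float16_t) = 9216 * 2 = 18432 bytes. Rounding CO_SLICES_ up to a multiple of block_size_.C keeps the weight_ptr arithmetic of the C2 kernel variants in bounds when CO_SLICES_ is odd.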
@@ -193,14 +206,14 @@ int ConvolutionOpenCLKernel::InitBias() {
   // align bias from C to C4
   auto bias_tensor = in_tensors_[2];
-  size_t packed_bias_size = CO_SLICES_ * CO_TILE * sizeof_FLT();
+  size_t packed_bias_size = UP_ROUND(CO_SLICES_, block_size_.C) * CO_TILE * sizeof_FLT_;
   packed_bias_ = allocator->Malloc(packed_bias_size);
   allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true);
   memset(packed_bias_, 0x00, packed_bias_size);
   if (bias_tensor->data_type() == kNumberTypeFloat16) {
     if (use_fp16_) {
-      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT());
+      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_);
     } else {
       auto packed_bias_fp32 = reinterpret_cast<float *>(packed_bias_);
       auto origin_bias_fp16 = reinterpret_cast<float16_t *>(bias_tensor->data_c());
@@ -216,7 +229,7 @@ int ConvolutionOpenCLKernel::InitBias() {
       packed_bias_fp16[i] = static_cast<float16_t>(origin_bias_fp32[i]);
     }
   } else {
-    memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT());
+    memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT_);
   }
   allocator->UnmapBuffer(packed_bias_);
@@ -231,6 +244,66 @@ int ConvolutionOpenCLKernel::InitBuffer() {
   return RET_OK;
 }
+void ConvolutionOpenCLKernel::SetBlockSize() {
+  auto task_size = static_cast<float>(batch_size_ * OH_ * OW_ * CO_SLICES_);
+  auto task_size_per_cu = task_size / ocl_runtime_->DeviceComputeUnits();
+  int block_size;
+  if (task_size_per_cu <= 256) {
+    block_size = 1;
+  } else if (task_size_per_cu <= 256 * 4) {
+    block_size = 2;
+  } else if (task_size_per_cu <= (use_fp16_ ? 256 * 8 : FLT_MAX)) {
+    block_size = 4;
+  } else {
+    block_size = 8;
+  }
+  bool w_kernel_is_1 =
+    KW_ == 1 && param_->stride_w_ == 1 && param_->dilation_w_ == 1 && param_->pad_l_ == 0 && param_->pad_r_ == 0;
+  bool h_kernel_is_1 =
+    KH_ == 1 && param_->stride_h_ == 1 && param_->dilation_h_ == 1 && param_->pad_u_ == 0 && param_->pad_d_ == 0;
+  if (!w_kernel_is_1 || !h_kernel_is_1) {
+    block_size = std::min(block_size, 4);
+  }
+  if (block_size == 8) {
+    block_size_ = {2, 2, 2};
+  } else if (block_size == 4) {
+    block_size_ = {2, 1, 2};
+  } else if (block_size == 2) {
+    block_size_ = {2, 1, 1};
+  } else {
+    block_size_ = {1, 1, 1};
+  }
+}
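Note: a worked example of the heuristic above (illustrative numbers, not from the patch): a 1x64x64x64 fp32 output gives CO_SLICES_ = 16 and task_size = 1 * 64 * 64 * 16 = 65536; on a device reporting 4 compute units, task_size_per_cu = 16384, which falls into the third branch (the fp32 bound is FLT_MAX), so block_size = 4 and block_size_ = {2, 1, 2}, i.e. Init() builds "Convolution_H2W1C2". Because of the std::min(block_size, 4) cap, {2, 2, 2} is only reachable for fp16 1x1 convolutions with stride 1 and no dilation or padding, where the extra w-column reuse is cheap.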
+void ConvolutionOpenCLKernel::SetGlobalLocal() {
+  size_t global_h = batch_size_ * UP_DIV(OH_, block_size_.H);
+  size_t global_w = UP_DIV(OW_, block_size_.W);
+  size_t global_c = UP_DIV(CO_SLICES_, block_size_.C);
+  constexpr int local_c_max = 16;
+  constexpr int local_hw_max = 256;
+  constexpr int OH_threshold = 100;
+  constexpr int OW_threshold = 100;
+  constexpr int OC_threshold = 64;
+  size_t local_c = GetMaxDivisor(global_c, local_c_max);
+  local_c = std::max<size_t>(local_c, 1);
+  size_t local_hw = local_hw_max / local_c;
+  size_t local_h;
+  size_t local_w;
+  if (OH_ >= OH_threshold && OW_ >= OW_threshold && CO_ <= OC_threshold) {  // c -> w -> h
+    local_w = std::min(global_w, local_hw);
+    local_h = std::min(local_hw / local_w, global_h);
+  } else {  // c -> h -> w
+    local_h = std::min(global_h, local_hw);
+    local_w = std::min(local_hw / local_h, global_w);
+  }
+  global_ = {global_h, global_w, global_c};
+  local_ = {local_h, local_w, local_c};
+}
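Note: continuing the same example (block_size_ = {2, 1, 2}, OH_ = OW_ = 64, CO_SLICES_ = 16): global_ = {1 * UP_DIV(64, 2), UP_DIV(64, 1), UP_DIV(16, 2)} = {32, 64, 8}. With OH_ and OW_ below the 100 threshold the c -> h -> w branch runs; assuming GetMaxDivisor(8, 16) returns the largest divisor of 8 not exceeding 16, i.e. 8, then local_hw = 256 / 8 = 32, local_h = min(32, 32) = 32, local_w = min(32 / 32, 64) = 1, so local_ = {32, 1, 8} — exactly 256 work-items per work-group, the local_hw_max budget.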
 int ConvolutionOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
@@ -292,48 +365,9 @@ int ConvolutionOpenCLKernel::Run() {
                             nullptr);
     ocl_runtime_->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
   } else {
-    std::vector<size_t> global, local;
-    SetGlobalLocalConv(&global, &local);
-    ocl_runtime_->RunKernel(kernel_conv_, global, local, nullptr);
-  }
-  return RET_OK;
-}
-int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local) {
-  constexpr size_t work_group_size[] = {4, 4, 1};
-  auto max_work_item_sizes = ocl_runtime_->GetWorkItemSize();
-  size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime_->Device())());
-  const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);
-  size_t global_nh = UP_DIV(batch_size_ * OH_, work_group_size[0]) * work_group_size[0];
-  size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1];
-  size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2];
-  size_t local_c = GetMaxDivisor(global_c, max_z_size);
-  if (local_c == 0) {
-    MS_LOG(ERROR) << "Divide by zero";
-    return mindspore::lite::RET_ERROR;
-  }
-  size_t local_hw_size = std::min<size_t>(256, max_work_group_size) / local_c;
-  size_t local_w = std::min(global_w, local_hw_size);
-  size_t local_nh = std::min(local_hw_size / local_w, global_nh);
-  if (local_nh == global_nh && global_nh % 2 == 0) {
-    local_nh = global_nh / 2;
-  }
-  if (OW_ * CO_SLICES_ > MAX_IMAGE2D_SIZE) {
-    local_w = 4;
-  }
-  global->clear();
-  global->push_back(UP_DIV(batch_size_ * OH_, local_nh) * local_nh);
-  global->push_back(UP_DIV(OW_, local_w) * local_w);
-  global->push_back(UP_DIV(CO_SLICES_, local_c) * local_c);
-  local->clear();
-  local->push_back(local_nh);
-  local->push_back(local_w);
-  local->push_back(local_c);
+    ocl_runtime_->RunKernel(kernel_conv_, global_, local_, nullptr);
   }
   return RET_OK;
 }
@@ -349,37 +383,43 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::Tenso
                            conv_param->stride_w_ == 1 && conv_param->pad_u_ == 0 && conv_param->pad_d_ == 0 &&
                            conv_param->pad_l_ == 0 && conv_param->pad_r_ == 0 && conv_param->dilation_h_ == 1 &&
                            conv_param->dilation_w_ == 1;
+  OpParameter *real_param;
   if (is_hw1 && is_pad_stride_ok) {
-    auto param = static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter)));
-    if (param == nullptr) {
+    auto fc_param = static_cast<MatMulParameter *>(malloc(sizeof(MatMulParameter)));
+    if (fc_param == nullptr) {
       MS_LOG(ERROR) << "Create OpenCL FullConnection kernel param failed!";
       return nullptr;
     }
-    param->op_parameter_.type_ = PrimitiveType_FullConnection;
-    param->a_transpose_ = false;
-    param->b_transpose_ = true;
-    param->act_type_ = conv_param->act_type_;
-    kernel = new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
+    fc_param->op_parameter_.type_ = PrimitiveType_FullConnection;
+    fc_param->a_transpose_ = false;
+    fc_param->b_transpose_ = true;
+    fc_param->act_type_ = conv_param->act_type_;
+    kernel = new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(fc_param), inputs, outputs);
+    real_param = reinterpret_cast<OpParameter *>(fc_param);
    if (kernel == nullptr) {
       MS_LOG(ERROR) << "Create OpenCL FullConnection kernel failed!";
-      free(param);
-      free(opParameter);
+      free(fc_param);
+      free(conv_param);
       return nullptr;
     } else {
-      free(opParameter);
+      free(conv_param);
     }
   } else {
-    kernel = new (std::nothrow) ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
+    kernel = new (std::nothrow) ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(conv_param), inputs, outputs);
+    real_param = reinterpret_cast<OpParameter *>(conv_param);
     if (kernel == nullptr) {
       MS_LOG(ERROR) << "Create OpenCL Convolution kernel failed!";
-      free(opParameter);
+      free(conv_param);
       return nullptr;
     }
   }
   auto ret = kernel->Init();
   if (ret != mindspore::lite::RET_OK) {
     MS_LOG(ERROR) << "Init kernel failed, name: Convolution";
     delete kernel;
+    free(real_param);
     return nullptr;
   }
   return kernel;
@@ -31,7 +31,7 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
  public:
   ConvolutionOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                           const std::vector<lite::Tensor *> &outputs)
-      : OpenCLKernel(parameter, inputs, outputs) {}
+      : OpenCLKernel(parameter, inputs, outputs), param_(reinterpret_cast<ConvParameter *>(parameter)) {}
   ~ConvolutionOpenCLKernel() override = default;
   int Init() override;
@@ -39,26 +39,32 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   int InitBuffer() override;
  private:
+  void SetBlockSize();
+  void SetGlobalLocal();
   int InitWeight();
   int InitBias();
   int GenerateWinogradWeight();
-  int SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local);
-  size_t sizeof_FLT() const { return use_fp16_ ? sizeof(float16_t) : sizeof(float); }
   bool UseWinograd4x4To6x6() {
-    auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-    const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 &&
-                            param->stride_w_ == 1 && param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 &&
-                            param->pad_r_ == 1 && param->dilation_h_ == 1 && param->dilation_w_ == 1 && IH_ == OH_ &&
-                            IW_ == OW_ && batch_size_ == 1;
+    const bool attr_valid = param_->kernel_h_ == 3 && param_->kernel_w_ == 3 && param_->stride_h_ == 1 &&
+                            param_->stride_w_ == 1 && param_->pad_u_ == 1 && param_->pad_d_ == 1 &&
+                            param_->pad_l_ == 1 && param_->pad_r_ == 1 && param_->dilation_h_ == 1 &&
+                            param_->dilation_w_ == 1 && IH_ == OH_ && IW_ == OW_ && batch_size_ == 1;
     const bool channel_good = CI_SLICES_ >= 8 && CO_SLICES_ >= 8;
     const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
     return attr_valid && channel_good && hw_good;
   }
-  cl::Kernel kernel_4x4to36_;
-  cl::Kernel kernel_conv_;
-  cl::Kernel kernel_36to4x4_;
+  std::vector<size_t> global_;
+  std::vector<size_t> local_;
   bool use_fp16_{false};
+  size_t sizeof_FLT_{4};
+  ConvParameter *param_{nullptr};
   int batch_size_{};
   int CI_{};
   int IH_{};
@@ -81,9 +87,11 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   void *winograd_mem0_{nullptr};
   void *winograd_mem1_{nullptr};
+  cl::Kernel kernel_4x4to36_;
+  cl::Kernel kernel_conv_;
+  cl::Kernel kernel_36to4x4_;
+  struct {
+    int H{1};
+    int W{1};
+    int C{1};
+  } block_size_;
 };
 }  // namespace mindspore::kernel
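Note: a minimal sketch of how block_size_ selects the kernel variant built in Init() — the string construction is copied from the .cc change above; the free-standing helper name is hypothetical:

  #include <string>
  std::string KernelName(int H, int W, int C) {
    // {1,1,1} -> "Convolution_H1W1C1", {2,1,2} -> "Convolution_H2W1C2", {2,2,2} -> "Convolution_H2W2C2"
    return "Convolution_H" + std::to_string(H) + "W" + std::to_string(W) + "C" + std::to_string(C);
  }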
@@ -38,30 +38,29 @@ struct OpenCLToFormatParameter {
 struct Image2DInfo {
   explicit Image2DInfo(const lite::Tensor *tensor) {
-    if (tensor != nullptr) {
-      auto shape = tensor->shape();
-      if (shape.size() == 1) {
-        N = shape[0];
-      } else if (shape.size() == 2) {
-        N = shape[0];
-        C = shape[1];
-      } else if (shape.size() == 3) {
-        N = shape[0];
-        W = shape[1];
-        C = shape[2];
-      } else if (shape.size() == 4) {
-        N = shape[0];
-        H = shape[1];
-        W = shape[2];
-        C = shape[3];
-      } else if (shape.size() >= 5) {
-        MS_LOG(ERROR) << "GPU dont't support Tensor with dim=" << shape.size();
-      }
-      FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
-    } else {
-      FLT_size = sizeof(cl_float);
+    if (tensor == nullptr) {
+      return;
     }
+    auto shape = tensor->shape();
+    if (shape.size() == 1) {
+      N = shape[0];
+    } else if (shape.size() == 2) {
+      N = shape[0];
+      C = shape[1];
+    } else if (shape.size() == 3) {
+      N = shape[0];
+      W = shape[1];
+      C = shape[2];
+    } else if (shape.size() == 4) {
+      N = shape[0];
+      H = shape[1];
+      W = shape[2];
+      C = shape[3];
+    } else if (shape.size() >= 5) {
+      MS_LOG(ERROR) << "GPU dont't support Tensor with dim=" << shape.size();
+    }
+    FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
     FLT4_size = FLT_size * 4;
     Slice = UP_DIV(C, C4NUM);
     if (W * Slice <= MAX_IMAGE2D_SIZE) {
@@ -72,16 +71,19 @@ struct Image2DInfo {
       width = N * H * Slice;
     }
-    auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
-    int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
-    row_pitch = (width + alignment - 1) / alignment * alignment * FLT4_size;
     ElementsNum = N * H * W * C;
     ElementsC4Num = N * H * W * Slice * C4NUM;
     OriginSize = ElementsNum * FLT_size;
     Image2DSize = height * width * FLT4_size;
   }
+  size_t RowPitch() const {
+    auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
+    int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
+    size_t row_pitch = (width + alignment - 1) / alignment * alignment * FLT4_size;
+    return row_pitch;
+  }
   size_t N{1};
   size_t H{1};
   size_t W{1};
@@ -89,9 +91,8 @@ struct Image2DInfo {
   size_t C{1};
   size_t Slice{};
   size_t width{};
   size_t height{};
-  size_t FLT_size{};
-  size_t FLT4_size{};
-  size_t row_pitch{};
+  size_t FLT_size{4};
+  size_t FLT4_size{16};
   size_t ElementsNum{};
   size_t ElementsC4Num{};
   size_t OriginSize{};
@@ -262,7 +262,7 @@ void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n, cons
     auto row_size = img_info.width * img_info.FLT4_size;
     for (int i = 0; i < img_info.height; ++i) {
       memcpy(reinterpret_cast<char *>(data.data()) + i * row_size,
-             static_cast<char *>(tensor->data_c()) + i * img_info.row_pitch, row_size);
+             static_cast<char *>(tensor->data_c()) + i * img_info.RowPitch(), row_size);
     }
   }
   allocator->UnmapBuffer(tensor->data_c());
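Note: a worked example of the RowPitch() computation used by PrintTensor() above. The alignment value is device-specific; 64 pixels is an assumption chosen for illustration:

  #include <cstddef>
  // C++ sketch of the pitch math in Image2DInfo::RowPitch()
  size_t RowPitchBytes(size_t width, size_t alignment, size_t flt4_size) {
    return (width + alignment - 1) / alignment * alignment * flt4_size;  // round width up to the alignment, then convert to bytes
  }
  // width = 100, alignment = 64, FLT4_size = 16  ->  128 * 16 = 2048 bytes per image row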