diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl index 9eaef6e6cd..32d63b50b2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl @@ -30,7 +30,7 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP exp1 = exp(-data); \ data = (exp0 - exp1) / (exp0 + exp1); -#define DO_LEAKY_RELU(data) \ +#define DO_LEAKY_RELU(data, alpha) \ data.x = data.x > 0 ? data.x : data.x * alpha; \ data.y = data.y > 0 ? data.y : data.y * alpha; \ data.z = data.z > 0 ? data.z : data.z * alpha; \ @@ -87,7 +87,7 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t FLT4 exp0, exp1; DO_TANH(out_h0_w0_c0); } else if (act_type == ActivationType_LEAKY_RELU) { - DO_LEAKY_RELU(out_h0_w0_c0); + DO_LEAKY_RELU(out_h0_w0_c0, alpha); } else if (act_type == ActivationType_SIGMOID) { out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); } @@ -166,8 +166,8 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t DO_TANH(out_h0_w0_c0); DO_TANH(out_h1_w0_c0); } else if (act_type == ActivationType_LEAKY_RELU) { - DO_LEAKY_RELU(out_h0_w0_c0); - DO_LEAKY_RELU(out_h1_w0_c0); + DO_LEAKY_RELU(out_h0_w0_c0, alpha); + DO_LEAKY_RELU(out_h1_w0_c0, alpha); } else if (act_type == ActivationType_SIGMOID) { out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); @@ -273,10 +273,10 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t DO_TANH(out_h0_w0_c1); DO_TANH(out_h1_w0_c1); } else if (act_type == ActivationType_LEAKY_RELU) { - DO_LEAKY_RELU(out_h0_w0_c0); - DO_LEAKY_RELU(out_h1_w0_c0); - DO_LEAKY_RELU(out_h0_w0_c1); - DO_LEAKY_RELU(out_h1_w0_c1); + DO_LEAKY_RELU(out_h0_w0_c0, alpha); + DO_LEAKY_RELU(out_h1_w0_c0, alpha); + DO_LEAKY_RELU(out_h0_w0_c1, alpha); + DO_LEAKY_RELU(out_h1_w0_c1, alpha); } else if (act_type == ActivationType_SIGMOID) { out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); @@ -438,14 +438,202 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t DO_TANH(out_h1_w0_c1); DO_TANH(out_h1_w1_c1); } else if (act_type == ActivationType_LEAKY_RELU) { - DO_LEAKY_RELU(out_h0_w0_c0); - DO_LEAKY_RELU(out_h0_w1_c0); - DO_LEAKY_RELU(out_h1_w0_c0); - DO_LEAKY_RELU(out_h1_w1_c0); - DO_LEAKY_RELU(out_h0_w0_c1); - DO_LEAKY_RELU(out_h0_w1_c1); - DO_LEAKY_RELU(out_h1_w0_c1); - DO_LEAKY_RELU(out_h1_w1_c1); + DO_LEAKY_RELU(out_h0_w0_c0, alpha); + DO_LEAKY_RELU(out_h0_w1_c0, alpha); + DO_LEAKY_RELU(out_h1_w0_c0, alpha); + DO_LEAKY_RELU(out_h1_w1_c0, alpha); + DO_LEAKY_RELU(out_h0_w0_c1, alpha); + DO_LEAKY_RELU(out_h0_w1_c1, alpha); + DO_LEAKY_RELU(out_h1_w0_c1, alpha); + DO_LEAKY_RELU(out_h1_w1_c1, alpha); + } else if (act_type == ActivationType_SIGMOID) { + out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); + out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0)); + out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); + out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0)); + out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1)); + out_h0_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c1)); + out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1)); + out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1)); + } + + if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); + WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); + WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0); + } // end if (oh1 < OH) + if (co_slice1 < CO_SLICES) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); + WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); + WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1); + } // end if (oh1 < OH) + } // end if (co_slice1 < CO_SLICES) + } else { + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow0), out_h0_w0_c0); + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice0, ow1), out_h0_w1_c0); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow0), out_h1_w0_c0); + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice0, ow1), out_h1_w1_c0); + } // end (oh1 < OH) + if (co_slice1 < CO_SLICES) { + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow0), out_h0_w0_c1); + WRITE_IMAGE(output, (int2)(n_oh0 * CO_SLICES + co_slice1, ow1), out_h0_w1_c1); + if (oh1 < OH) { + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow0), out_h1_w0_c1); + WRITE_IMAGE(output, (int2)(n_oh1 * CO_SLICES + co_slice1, ow1), out_h1_w1_c1); + } // end if (oh1 < OH) + } // end if (co_slice1 < CO_SLICES) + } +} + +__kernel void Conv2D_H2W2C2_Img(__read_only image2d_t input, __write_only image2d_t output, + __read_only image2d_t weight, __global FLT4 *bias, int4 input_shape, int4 output_shape, + int4 kernel_stride, int4 pad, int2 dilation, int act_type, float alpha) { + const int BlockH = 2; + const int BlockW = 2; + const int BlockC = 2; + DEFINE_ARGS; + + int oh0 = oh + 0; + int oh1 = oh + 1; + int n_oh0 = n * OH + oh0; + int n_oh1 = n * OH + oh1; + int ow0 = ow + 0; + int ow1 = ow + 1; + int co_slice0 = co_slice + 0; + int co_slice1 = co_slice + 1; + + FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h0_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h0_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w0_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out_h1_w1_c1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + int filter_offset = 0; + for (int kh = 0; kh < KH; ++kh) { + int ih0 = kh * dilationH + oh0 * strideH - padTop; + // no need to check oh1, finally write out will check (oh1 < OH) + int ih1 = kh * dilationH + oh1 * strideH - padTop; + // check ih0 and ih1 + int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; + + for (int kw = 0; kw < KW; ++kw) { + int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int iw1 = (ow1 < OW) ? kw * dilationW + ow1 * strideW - padLeft : -2; + int x_idx0 = iw0 * CI_SLICES; + int x_idx1 = iw1 * CI_SLICES; + + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in_h0_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx0)); + FLT4 in_h0_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx0)); + FLT4 in_h1_w0 = READ_IMAGE(input, smp_zero, (int2)(x_idx0, y_idx1)); + FLT4 in_h1_w1 = READ_IMAGE(input, smp_zero, (int2)(x_idx1, y_idx1)); + x_idx0++; + x_idx1++; + + FLT4 filter_ci0_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 0)); + FLT4 filter_ci1_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 1)); + FLT4 filter_ci2_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 2)); + FLT4 filter_ci3_co0 = READ_IMAGE(weight, smp_zero, (int2)(co_slice0, filter_offset + 3)); + FLT4 filter_ci0_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 0)); + FLT4 filter_ci1_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 1)); + FLT4 filter_ci2_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 2)); + FLT4 filter_ci3_co1 = READ_IMAGE(weight, smp_zero, (int2)(co_slice1, filter_offset + 3)); + filter_offset += 4; + + out_h0_w0_c0 += filter_ci0_co0 * in_h0_w0.x; + out_h0_w1_c0 += filter_ci0_co0 * in_h0_w1.x; + out_h1_w0_c0 += filter_ci0_co0 * in_h1_w0.x; + out_h1_w1_c0 += filter_ci0_co0 * in_h1_w1.x; + out_h0_w0_c0 += filter_ci1_co0 * in_h0_w0.y; + out_h0_w1_c0 += filter_ci1_co0 * in_h0_w1.y; + out_h1_w0_c0 += filter_ci1_co0 * in_h1_w0.y; + out_h1_w1_c0 += filter_ci1_co0 * in_h1_w1.y; + out_h0_w0_c0 += filter_ci2_co0 * in_h0_w0.z; + out_h0_w1_c0 += filter_ci2_co0 * in_h0_w1.z; + out_h1_w0_c0 += filter_ci2_co0 * in_h1_w0.z; + out_h1_w1_c0 += filter_ci2_co0 * in_h1_w1.z; + out_h0_w0_c0 += filter_ci3_co0 * in_h0_w0.w; + out_h0_w1_c0 += filter_ci3_co0 * in_h0_w1.w; + out_h1_w0_c0 += filter_ci3_co0 * in_h1_w0.w; + out_h1_w1_c0 += filter_ci3_co0 * in_h1_w1.w; + + out_h0_w0_c1 += filter_ci0_co1 * in_h0_w0.x; + out_h0_w1_c1 += filter_ci0_co1 * in_h0_w1.x; + out_h1_w0_c1 += filter_ci0_co1 * in_h1_w0.x; + out_h1_w1_c1 += filter_ci0_co1 * in_h1_w1.x; + out_h0_w0_c1 += filter_ci1_co1 * in_h0_w0.y; + out_h0_w1_c1 += filter_ci1_co1 * in_h0_w1.y; + out_h1_w0_c1 += filter_ci1_co1 * in_h1_w0.y; + out_h1_w1_c1 += filter_ci1_co1 * in_h1_w1.y; + out_h0_w0_c1 += filter_ci2_co1 * in_h0_w0.z; + out_h0_w1_c1 += filter_ci2_co1 * in_h0_w1.z; + out_h1_w0_c1 += filter_ci2_co1 * in_h1_w0.z; + out_h1_w1_c1 += filter_ci2_co1 * in_h1_w1.z; + out_h0_w0_c1 += filter_ci3_co1 * in_h0_w0.w; + out_h0_w1_c1 += filter_ci3_co1 * in_h0_w1.w; + out_h1_w0_c1 += filter_ci3_co1 * in_h1_w0.w; + out_h1_w1_c1 += filter_ci3_co1 * in_h1_w1.w; + } + } + } + + if (bias != 0) { + out_h0_w0_c0 += bias[co_slice0]; + out_h0_w1_c0 += bias[co_slice0]; + out_h1_w0_c0 += bias[co_slice0]; + out_h1_w1_c0 += bias[co_slice0]; + out_h0_w0_c1 += bias[co_slice1]; + out_h0_w1_c1 += bias[co_slice1]; + out_h1_w0_c1 += bias[co_slice1]; + out_h1_w1_c1 += bias[co_slice1]; + } + + if (act_type == ActivationType_RELU) { + out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); + out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f)); + out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); + out_h1_w1_c0 = max(out_h1_w1_c0, (FLT4)(0.0f)); + out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f)); + out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f)); + out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); + out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f)); + } else if (act_type == ActivationType_RELU6) { + out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w1_c0 = clamp(out_h1_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + } else if (act_type == ActivationType_TANH) { + FLT4 exp0, exp1; + DO_TANH(out_h0_w0_c0); + DO_TANH(out_h0_w1_c0); + DO_TANH(out_h1_w0_c0); + DO_TANH(out_h1_w1_c0); + DO_TANH(out_h0_w0_c1); + DO_TANH(out_h0_w1_c1); + DO_TANH(out_h1_w0_c1); + DO_TANH(out_h1_w1_c1); + } else if (act_type == ActivationType_LEAKY_RELU) { + DO_LEAKY_RELU(out_h0_w0_c0, alpha); + DO_LEAKY_RELU(out_h0_w1_c0, alpha); + DO_LEAKY_RELU(out_h1_w0_c0, alpha); + DO_LEAKY_RELU(out_h1_w1_c0, alpha); + DO_LEAKY_RELU(out_h0_w0_c1, alpha); + DO_LEAKY_RELU(out_h0_w1_c1, alpha); + DO_LEAKY_RELU(out_h1_w0_c1, alpha); + DO_LEAKY_RELU(out_h1_w1_c1, alpha); } else if (act_type == ActivationType_SIGMOID) { out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0)); diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl index 295265a6ad..b9b1644c2b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl @@ -2,6 +2,7 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#define CI_TILE 4 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) constant FLT Bt[36] = { @@ -13,65 +14,55 @@ constant FLT Bt[36] = { 0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f, }; -__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output, - int4 input_shape, // N H W CI_SLICES - int4 output_shape) { // N 36 H/4*W/4 CI_SLICES -#define PAD 1 - int tile_xy = get_global_id(0); +__kernel void Winograd4x4To36(__read_only image2d_t input, // height=N*H width=W*CI_SLICES + __write_only image2d_t output, // height=CI_SLICES*36 width=H/4*W/4 + int4 input_shape, // N H W CI_SLICES + int TILE_HW, int pad) { + int tile_hw = get_global_id(0); int row = get_global_id(1); - int slice = get_global_id(2); - - int TILE_XY = output_shape.z; - int SLICES = input_shape.w; - if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES) { + int ci_slice = get_global_id(2); + int H = input_shape.y; + int W = input_shape.z; + int CI_SLICES = input_shape.w; + if (tile_hw >= TILE_HW || row >= 6 || ci_slice >= CI_SLICES) { return; } - - int IH = input_shape.y, IW = input_shape.z; - int TILE_X = UP_DIV(IW, 4); - int tile_x = tile_xy % TILE_X; - int tile_y = tile_xy / TILE_X; + int TILE_W = UP_DIV(W, 4); + int tile_w = tile_hw % TILE_W; + int tile_h = tile_hw / TILE_W; constant FLT *Bt_row = Bt + row * 6; FLT4 BtD_row[6] = {0}; - - int ih = tile_y * 4 - PAD; - int iw = tile_x * 4 - PAD; + int h = tile_h * 4 - pad; + int w = tile_w * 4 - pad; for (int y = 0; y < 6; y++) { - int x_idx = iw * SLICES + slice; + int x_idx = w * CI_SLICES + ci_slice; for (int x = 0; x < 6; x++) { - // no need to check iw: because slice is in [0, SLICES). when iw<0, x_idx<0; iw>=IW, x_idx>=IW*SLICES - // if (iw < 0 || iw >= IW) { continue; } - BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, ih)); - x_idx += SLICES; + // no need to check w: because ci_slice is in [0, CI_SLICES). when w<0, x_idx<0; w>=W, x_idx>=W*CI_SLICES + // if (w < 0 || w >= W) { continue; } + BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, h)); + x_idx += CI_SLICES; } - ih++; + h++; } - int y_idx = slice * 36 + row * 6; + int y_idx = ci_slice * 36 + row * 6; for (int y = 0; y < 6; y++) { FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); for (int x = 0; x < 6; x++) { acc += BtD_row[x] * Bt[y * 6 + x]; } - WRITE_IMAGE(output, (int2)(tile_xy, y_idx + y), acc); // CH W H=36 + WRITE_IMAGE(output, (int2)(tile_hw, y_idx + y), acc); } -#undef PAD } -__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, - int4 input_shape, // N 36 H/4*W/4 CI_SLICES - int4 output_shape) { // N 36 H/4*W/4 CO_SLICES -#define H 36 - int w = get_global_id(0) * 2; +__kernel void WinogradConv2D(__read_only image2d_t input, // height=CI_SLICES*36 width=TILE_HW + __write_only image2d_t output, // height=CO_SLICES*36 width=TILE_HW + __global FLT16 *weight, int TILE_HW, int CI_SLICES, int CO_SLICES) { + int tile_hw = get_global_id(0) * 2; int h = get_global_id(1); int co_slice = get_global_id(2) * 2; - - int CI_SLICES = input_shape.w; - int W = input_shape.z; - int CO_SLICES = output_shape.w; - - if (h >= H || w >= W || co_slice >= CO_SLICES) { + if (h >= 36 || tile_hw >= TILE_HW || co_slice >= CO_SLICES) { return; } @@ -83,8 +74,8 @@ __kernel void WinogradConvolution(__read_only image2d_t input, __write_only imag int y_idx = h; __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2; for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { - FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(w + 0, y_idx)); - FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(w + 1, y_idx)); + FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(tile_hw + 0, y_idx)); + FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(tile_hw + 1, y_idx)); y_idx += 36; FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1]; @@ -111,57 +102,110 @@ __kernel void WinogradConvolution(__read_only image2d_t input, __write_only imag out11 += in1.w * weight1.scdef; } - WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00); - if (w + 1 < W) { - WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01); + WRITE_IMAGE(output, (int2)(tile_hw + 0, (co_slice + 0) * 36 + h), out00); + WRITE_IMAGE(output, (int2)(tile_hw + 1, (co_slice + 0) * 36 + h), out01); + WRITE_IMAGE(output, (int2)(tile_hw + 0, (co_slice + 1) * 36 + h), out10); + WRITE_IMAGE(output, (int2)(tile_hw + 1, (co_slice + 1) * 36 + h), out11); +} + +__kernel void WinogradConv2D_Img(__read_only image2d_t input, // height=CI_SLICES*36 width=TILE_HW + __write_only image2d_t output, // height=CO_SLICES*36 width=TILE_HW + __read_only image2d_t weight, int TILE_HW, int CI_SLICES, int CO_SLICES) { + int tile_hw = get_global_id(0) * 2; + int h = get_global_id(1); + int co_slice = get_global_id(2) * 2; + if (h >= 36 || tile_hw >= TILE_HW || co_slice >= CO_SLICES) { + return; } + int CI_ALIGN = CI_SLICES * CI_TILE; - if (co_slice + 1 < CO_SLICES) { - WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10); - if (w + 1 < W) { - WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11); - } + FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); + + int y_idx = h; + for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { + FLT4 in0 = READ_IMAGE(input, smp_zero, (int2)(tile_hw + 0, y_idx)); + FLT4 in1 = READ_IMAGE(input, smp_zero, (int2)(tile_hw + 1, y_idx)); + y_idx += 36; + + FLT4 filter_ci0_co0 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 0, co_slice + 0)); + FLT4 filter_ci1_co0 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 1, co_slice + 0)); + FLT4 filter_ci2_co0 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 2, co_slice + 0)); + FLT4 filter_ci3_co0 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 3, co_slice + 0)); + FLT4 filter_ci0_co1 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 0, co_slice + 1)); + FLT4 filter_ci1_co1 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 1, co_slice + 1)); + FLT4 filter_ci2_co1 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 2, co_slice + 1)); + FLT4 filter_ci3_co1 = READ_IMAGE(weight, smp_zero, (int2)(h * CI_ALIGN + ci_slice * CI_TILE + 3, co_slice + 1)); + + out00 += in0.x * filter_ci0_co0; + out00 += in0.y * filter_ci1_co0; + out00 += in0.z * filter_ci2_co0; + out00 += in0.w * filter_ci3_co0; + + out01 += in1.x * filter_ci0_co0; + out01 += in1.y * filter_ci1_co0; + out01 += in1.z * filter_ci2_co0; + out01 += in1.w * filter_ci3_co0; + + out10 += in0.x * filter_ci0_co1; + out10 += in0.y * filter_ci1_co1; + out10 += in0.z * filter_ci2_co1; + out10 += in0.w * filter_ci3_co1; + + out11 += in1.x * filter_ci0_co1; + out11 += in1.y * filter_ci1_co1; + out11 += in1.z * filter_ci2_co1; + out11 += in1.w * filter_ci3_co1; } -#undef H + + WRITE_IMAGE(output, (int2)(tile_hw + 0, (co_slice + 0) * 36 + h), out00); + WRITE_IMAGE(output, (int2)(tile_hw + 1, (co_slice + 0) * 36 + h), out01); + WRITE_IMAGE(output, (int2)(tile_hw + 0, (co_slice + 1) * 36 + h), out10); + WRITE_IMAGE(output, (int2)(tile_hw + 1, (co_slice + 1) * 36 + h), out11); } +#define DO_LEAKY_RELU(data, alpha) \ + data.x = data.x > 0 ? data.x : data.x * alpha; \ + data.y = data.y > 0 ? data.y : data.y * alpha; \ + data.z = data.z > 0 ? data.z : data.z * alpha; \ + data.w = data.w > 0 ? data.w : data.w * alpha; + constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f, 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f, 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f, 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; -__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias, - int4 input_shape, // N 36 H/4*W/4 CO_SLICES +__kernel void Winograd36To4x4(__read_only image2d_t input, // height=CO_SLICES*36 width=TILE_HW + __write_only image2d_t output, // height=N*H width=W*CO_SLICES + __global FLT4 *bias, int4 output_shape, // N H W CO_SLICES - int act_type, float alpha) { - int tile_xy = get_global_id(0); + int TILE_HW, int act_type, float alpha) { + int tile_hw = get_global_id(0); int row = get_global_id(1); - int slice = get_global_id(2); - - int TILE_XY = input_shape.z; - int SLICES = input_shape.w; - int OH = output_shape.y; - int OW = output_shape.z; - - if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES) { + int co_slice = get_global_id(2); + int H = output_shape.y; + int W = output_shape.z; + int CO_SLICES = output_shape.w; + if (tile_hw >= TILE_HW || row >= 4 || co_slice >= CO_SLICES) { return; } constant FLT *At_row = At + row * 6; FLT4 AtM_row[6] = {0}; - for (int y = 0, idx = slice * 36; y < 6; y++) { + for (int y = 0, idx = co_slice * 36; y < 6; y++) { for (int x = 0; x < 6; x++, idx++) { - AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_xy, idx)); + AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx)); } } - int TILE_X = UP_DIV(OW, 4); - int tile_x = tile_xy % TILE_X; - int tile_y = tile_xy / TILE_X; - int oh = tile_y * 4 + row; - int ow = tile_x * 4; - int x_idx = ow * SLICES + slice; - + int TILE_W = UP_DIV(W, 4); + int tile_w = tile_hw % TILE_W; + int tile_h = tile_hw / TILE_W; + int h = tile_h * 4 + row; + int w = tile_w * 4; + int x_idx = w * CO_SLICES + co_slice; for (int x = 0, idx = 0; x < 4; x++) { FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); for (int y = 0; y < 6; y++, idx++) { @@ -169,7 +213,7 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_ } if (bias != 0) { - acc += bias[slice]; + acc += bias[co_slice]; } if (act_type == ActivationType_RELU) { @@ -181,15 +225,11 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_ FLT4 exp1 = exp(-acc); acc = (exp0 - exp1) / (exp0 + exp1); } else if (act_type == ActivationType_LEAKY_RELU) { - if (acc.x < 0) acc.x *= alpha; - if (acc.y < 0) acc.y *= alpha; - if (acc.z < 0) acc.z *= alpha; - if (acc.w < 0) acc.w *= alpha; + DO_LEAKY_RELU(acc, alpha); } else if (act_type == ActivationType_SIGMOID) { acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc)); } - - WRITE_IMAGE(output, (int2)(x_idx, oh), acc); - x_idx += SLICES; + WRITE_IMAGE(output, (int2)(x_idx, h), acc); + x_idx += CO_SLICES; } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc index 7b2673cb96..7c8f5b60cd 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc @@ -14,18 +14,18 @@ * limitations under the License. */ +#include "src/runtime/kernel/opencl/kernel/conv2d.h" #include #include #include -#include "src/common/utils.h" -#include "src/runtime/kernel/opencl/kernel/conv2d.h" -#include "src/runtime/kernel/opencl/kernel/fullconnection.h" -#include "src/runtime/kernel/opencl/utils.h" #include "src/kernel_registry.h" #include "include/errorcode.h" #include "schema/ops_generated.h" +#include "src/common/utils.h" +#include "src/runtime/kernel/opencl/utils.h" +#include "src/runtime/kernel/opencl/kernel/fullconnection.h" +#include "src/runtime/kernel/opencl/kernel/winograd.h" #include "src/runtime/kernel/opencl/cl/conv2d.cl.inc" -#include "src/runtime/kernel/opencl/cl/winograd.cl.inc" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; @@ -41,38 +41,46 @@ using mindspore::schema::PrimitiveType_FullConnection; namespace mindspore::kernel { -const size_t CI_TILE = C4NUM; -const size_t CO_TILE = C4NUM; - int Conv2DOpenCLKernel::CheckSpecs() { - if (in_tensors_.size() != 2 && in_tensors_.size() != 3) { - MS_LOG(ERROR) << "Conv2D only supports 2 or 3 input Tensor but get " << in_tensors_.size(); + int inputs_num = in_tensors_.size(); + if (inputs_num != 2 && inputs_num != 3) { + MS_LOG(ERROR) << "Conv2D only supports 2 or 3 input Tensor but get " << inputs_num; return RET_ERROR; } - if (out_tensors_.size() != 1) { - MS_LOG(ERROR) << "Conv2D only supports 1 output Tensor but get " << out_tensors_.size(); + int outputs_num = out_tensors_.size(); + if (outputs_num != 1) { + MS_LOG(ERROR) << "Conv2D only supports 1 output Tensor but get " << outputs_num; return RET_ERROR; } - if (in_tensors_.front()->shape().size() != 4) { - MS_LOG(ERROR) << "Conv2D only supports 4D input Tensor but get " << in_tensors_.front()->shape().size() << "D."; + + int input_ndim = in_tensors_.at(kInputIndex)->shape().size(); + if (input_ndim != 4) { + MS_LOG(ERROR) << "Conv2D only supports 4D input Tensor but get " << input_ndim << "D."; return RET_ERROR; } - if (in_tensors_.at(1)->shape().size() != 4) { - MS_LOG(ERROR) << "Conv2D only supports 4D filter Tensor but get " << in_tensors_.at(1)->shape().size() << "D."; + int output_ndim = out_tensors_.at(kOutputIndex)->shape().size(); + if (output_ndim != 4) { + MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << output_ndim << "D."; return RET_ERROR; } - if (out_tensors_.front()->shape().size() != 4) { - MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << out_tensors_.front()->shape().size() << "D."; + + auto *filter_tensor = in_tensors_.at(kWeightIndex); + int filter_ndim = filter_tensor->shape().size(); + if (filter_ndim != 4) { + MS_LOG(ERROR) << "Conv2D only supports 4D filter Tensor but get " << filter_ndim << "D."; return RET_ERROR; } - if (!in_tensors_.at(1)->IsConst()) { + if (!filter_tensor->IsConst()) { MS_LOG(ERROR) << "Conv2D don't support non-constant filter yet."; return RET_ERROR; } - if (in_tensors_.size() == 3 && !in_tensors_.at(2)->IsConst()) { + + auto *bias_tensor = in_tensors_.size() >= 3 ? in_tensors_.at(kBiasIndex) : nullptr; + if (bias_tensor != nullptr && !bias_tensor->IsConst()) { MS_LOG(ERROR) << "Conv2D don't support non-constant bias yet."; return RET_ERROR; } + // for fusion: ActivationType_LEAKY_RELU ActivationType_TANH switch (static_cast(param_->act_type_)) { case ActType_No: @@ -90,9 +98,17 @@ int Conv2DOpenCLKernel::CheckSpecs() { } int Conv2DOpenCLKernel::Prepare() { + InitAttrs(); + BuildKernel(); + InitWeights(); + SetGlobalLocal(); + SetConstArgs(); + return RET_OK; +} + +void Conv2DOpenCLKernel::InitAttrs() { use_fp16_ = ocl_runtime_->GetFp16Enable(); sizeof_FLT_ = use_fp16_ ? sizeof(float16_t) : sizeof(float); - auto input_shape = in_tensors_.front()->shape(); auto output_shape = out_tensors_.front()->shape(); batch_size_ = input_shape[0]; @@ -109,170 +125,167 @@ int Conv2DOpenCLKernel::Prepare() { OW_ = output_shape[2]; CO_ = output_shape[3]; } - CI_SLICES_ = UP_DIV(CI_, C4NUM); - CO_SLICES_ = UP_DIV(CO_, C4NUM); + CI_SLICES_ = UP_DIV(CI_, CI_TILE); + CO_SLICES_ = UP_DIV(CO_, CO_TILE); KH_ = param_->kernel_h_; KW_ = param_->kernel_w_; has_bias_ = in_tensors_.size() == 3; + // note: TILE_HW_ is only used when use_winograd_=true + TILE_HW_ = UP_DIV(OW_, 4) * UP_DIV(OH_, 4); +} - // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true - TILES_X_ = UP_DIV(OW_, 4); - TILES_Y_ = UP_DIV(OH_, 4); - TILES_XY_ = TILES_X_ * TILES_Y_; - use_winograd_ = UseWinograd4x4To6x6(); - - // build kernel - if (use_winograd_) { - MS_LOG(DEBUG) << "use winograd"; - std::string program_name = "winograd"; - ocl_runtime_->LoadSource(program_name, GetActDefines() + winograd_source); - ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36"); - ocl_runtime_->BuildKernel(kernel_, program_name, "WinogradConvolution"); - ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); - } else { - SetBlockSize(); - std::string program_name = "conv2d"; - std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" + - std::to_string(block_size_.C); - ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); +void Conv2DOpenCLKernel::BuildKernel() { + SetBlockSize(); + std::string program_name = "conv2d"; + std::stringstream kernel_name; + kernel_name << "Conv2D_H" << block_size_.H << "W" << block_size_.W << "C" << block_size_.C; + if (filter_type_ == MemType::IMG) { + kernel_name << "_Img"; } + ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str()); +} - // allocate winograd memory - if (use_winograd_) { - auto allocator = ocl_runtime_->GetAllocator(); - size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; - - size_t size = TILES_XY_ * CI_SLICES_ * 36 * sizeof_FLT_; - size_t width = TILES_XY_; - size_t height = CI_SLICES_ * 36; - winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype}); - - size = TILES_XY_ * CO_SLICES_ * 36 * sizeof_FLT_; - width = TILES_XY_; - height = CO_SLICES_ * 36; - winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype}); +void Conv2DOpenCLKernel::SetBlockSize() { + if (filter_type_ == MemType::IMG) { + block_size_ = {2, 2, 2}; + return; + } + auto task_size = static_cast(batch_size_ * OH_ * OW_ * CO_SLICES_); + auto task_size_per_cu = task_size / ocl_runtime_->DeviceComputeUnits(); + int block_size; + if (task_size_per_cu <= 256) { + block_size = 1; + } else if (task_size_per_cu <= 256 * 4) { + block_size = 2; + } else if (task_size_per_cu <= (use_fp16_ ? 256 * 8 : FLT_MAX)) { + block_size = 4; + } else { + block_size = 8; } - auto ret = InitWeights(); - if (ret != RET_OK) { - return ret; + bool w_kernel_is_1 = + KW_ == 1 && param_->stride_w_ == 1 && param_->dilation_w_ == 1 && param_->pad_l_ == 0 && param_->pad_r_ == 0; + bool h_kernel_is_1 = + KH_ == 1 && param_->stride_h_ == 1 && param_->dilation_h_ == 1 && param_->pad_u_ == 0 && param_->pad_d_ == 0; + if (!w_kernel_is_1 || !h_kernel_is_1) { + block_size = std::min(block_size, 4); } - SetGlobalLocal(); - SetConstArgs(); - return RET_OK; -} -int Conv2DOpenCLKernel::GenerateWinogradFilter() { - const float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000, - 0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000, - 0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000}; - const float G[] = {1.0000000000, 0.0000000000, 0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702, - 1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808, - 1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000}; - - auto weight_tensor = in_tensors_.at(1); - auto origin_weight_fp32 = reinterpret_cast(weight_tensor->data_c()); - MS_ASSERT(origin_weight_fp32); - auto origin_weight_fp16 = reinterpret_cast(weight_tensor->data_c()); - MS_ASSERT(origin_weight_fp16); - std::function access_func; - if (weight_tensor->data_type() == kNumberTypeFloat32) { - access_func = [=](int idx) { return origin_weight_fp32[idx]; }; + if (block_size == 8) { + block_size_ = {2, 2, 2}; + } else if (block_size == 4) { + block_size_ = {2, 1, 2}; + } else if (block_size == 2) { + block_size_ = {2, 1, 1}; } else { - access_func = [=](int idx) { return static_cast(origin_weight_fp16[idx]); }; + block_size_ = {1, 1, 1}; } +} - // OHWI -> O66I - std::vector encoded_weight(CO_ * 6 * 6 * CI_); - for (int co = 0; co < CO_; ++co) { - for (int ci = 0; ci < CI_; ++ci) { - float in_vals[9]; - for (int kh = 0; kh < 3; ++kh) { - for (int kw = 0; kw < 3; ++kw) { - const int f_index = ((co * 3 + kh) * 3 + kw) * CI_ + ci; - in_vals[kh * 3 + kw] = access_func(f_index); - } - } +int Conv2DOpenCLKernel::InitWeights() { + InitFilter(); + if (has_bias_) { + InitBias(); + } + return RET_OK; +} - auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3); - auto out_vals = MatrixMultiply(temp_vals.data(), Gt, 6, 3, 6); - for (int kh = 0; kh < 6; ++kh) { - for (int kw = 0; kw < 6; ++kw) { - const int f_index = ((co * 6 + kh) * 6 + kw) * CI_ + ci; - encoded_weight[f_index] = out_vals[kh * 6 + kw]; +void ConvertFilter(void *src, void *dst, TypeId src_dtype, TypeId dst_dtype, FilterFormat src_format, + FilterFormat dst_format, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup) { + MS_ASSERT(src); + MS_ASSERT(dst); + MS_ASSERT(src_dtype == kNumberTypeFloat16 || src_dtype == kNumberTypeFloat32); + MS_ASSERT(dst_dtype == kNumberTypeFloat16 || dst_dtype == kNumberTypeFloat32); + MS_ASSERT(src_format == OHWI); + MS_ASSERT(dst_format == HWII4OO4 || dst_format == OHWIOgroupI4O4); + auto src_fp16 = reinterpret_cast(src); + auto src_fp32 = reinterpret_cast(src); + auto dst_fp16 = reinterpret_cast(dst); + auto dst_fp32 = reinterpret_cast(dst); + bool src_is_fp16 = src_dtype == kNumberTypeFloat16; + bool dst_is_fp16 = dst_dtype == kNumberTypeFloat16; + auto CI_SLICES = UP_DIV(CI, CI_TILE); + auto CO_SLICES = UP_DIV(CO, CO_TILE); + for (size_t co = 0, src_idx = 0; co < CO; ++co) { + for (size_t kh = 0; kh < KH; ++kh) { + for (size_t kw = 0; kw < KW; ++kw) { + for (size_t ci = 0; ci < CI; ++ci, ++src_idx) { + size_t dst_idx = 0; + size_t co_inner = co % CO_TILE; + size_t ci_slice = ci / CI_TILE; + size_t ci_inner = ci % CI_TILE; + if (dst_format == OHWIOgroupI4O4) { + size_t co_slice = co / (CO_TILE * OGroup); + size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE; + dst_idx = + (((((co_slice * KH + kh) * KW + kw) * CI_SLICES + ci_slice) * OGroup + group_idx) * CI_TILE + ci_inner) * + CO_TILE + + co_inner; + } else { // if(dst_format==HWII4OO4) + size_t co_slice = co / CO_TILE; + dst_idx = + ((((kh * KW + kw) * CI_SLICES + ci_slice) * CI_TILE + ci_inner) * CO_SLICES + co_slice) * CO_TILE + + co_inner; + } + if (dst_is_fp16) { + dst_fp16[dst_idx] = src_is_fp16 ? src_fp16[src_idx] : static_cast(src_fp32[src_idx]); + } else { + dst_fp32[dst_idx] = src_is_fp16 ? static_cast(src_fp16[src_idx]) : src_fp32[src_idx]; + } } } } } - - if (use_fp16_) { - ConvertConvWeight4DTo7D(reinterpret_cast(encoded_weight.data()), packed_weight_, CO_, 6, - 6, CI_, 2); - } else { - ConvertConvWeight4DTo7D(reinterpret_cast(encoded_weight.data()), packed_weight_, CO_, 6, 6, - CI_, 2); - } - - return RET_OK; } -int Conv2DOpenCLKernel::InitFilter() { +void Conv2DOpenCLKernel::InitFilter() { auto allocator = ocl_runtime_->GetAllocator(); + auto ret = DequantWeight(); if (ret != RET_OK) { - return ret; - } - - // allocate memory - size_t packed_weight_size; - if (use_winograd_) { - packed_weight_size = UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE * sizeof_FLT_; + return; + } + + // allocate opencl memory: buffer or image2d + size_t size = 0; + int Ogroup = block_size_.C; + if (filter_type_ == MemType::IMG) { + size_t width = CO_SLICES_; + size_t height = KH_ * KW_ * UP_ROUND(CI_, CI_TILE); + size_t dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; + size = width * height * CO_TILE * sizeof_FLT_; + packed_filter_ = allocator->Malloc(size, {width, height, dtype}); + } else { + size = UP_DIV(CO_SLICES_, Ogroup) * KH_ * KW_ * CI_SLICES_ * Ogroup * CI_TILE * CO_TILE * sizeof_FLT_; + packed_filter_ = allocator->Malloc(size); + } + + // rearrange filter + auto filter_tensor = in_tensors_.at(1); + void *src_data = filter_tensor->data_c(); + auto src_dtype = filter_tensor->data_type(); + auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32; + std::vector tmp(size, 0); + if (filter_type_ == MemType::IMG) { + ConvertFilter(src_data, tmp.data(), src_dtype, dst_dtype, OHWI, HWII4OO4, CO_, KH_, KW_, CI_); } else { - packed_weight_size = UP_ROUND(CO_SLICES_, block_size_.C) * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE * sizeof_FLT_; + ConvertFilter(src_data, tmp.data(), src_dtype, dst_dtype, OHWI, OHWIOgroupI4O4, CO_, KH_, KW_, CI_, Ogroup); } - packed_weight_ = allocator->Malloc(packed_weight_size); - allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true); - memset(packed_weight_, 0x00, packed_weight_size); - // rearrange weight - if (use_winograd_) { - GenerateWinogradFilter(); + // unmap + if (filter_type_ == MemType::IMG) { + ocl_runtime_->WriteImage(packed_filter_, tmp.data()); } else { - auto weight_tensor = in_tensors_.at(1); - if (weight_tensor->data_type() == kNumberTypeFloat16) { - if (use_fp16_) { - ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, - block_size_.C); - } else { - ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, - block_size_.C); - } - } else if (weight_tensor->data_type() == kNumberTypeFloat32) { - if (use_fp16_) { - ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, - block_size_.C); - } else { - ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, - block_size_.C); - } - } else { // int8 or int16 - if (use_fp16_) { - ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, - block_size_.C); - } else { - ConvertConvWeight4DTo7D(weight_tensor->data_c(), packed_weight_, CO_, KH_, KW_, CI_, - block_size_.C); - } - } + allocator->MapBuffer(packed_filter_, CL_MAP_WRITE, nullptr, true); + memcpy(packed_filter_, tmp.data(), size); + allocator->UnmapBuffer(packed_filter_); } - allocator->UnmapBuffer(packed_weight_); FreeDequantedWeight(); - return RET_OK; } -int Conv2DOpenCLKernel::InitBias() { +void Conv2DOpenCLKernel::InitBias() { auto allocator = ocl_runtime_->GetAllocator(); // align bias from C to C4 @@ -306,95 +319,64 @@ int Conv2DOpenCLKernel::InitBias() { } } allocator->UnmapBuffer(packed_bias_); - return RET_OK; } -int Conv2DOpenCLKernel::InitWeights() { - InitFilter(); - if (has_bias_) { - InitBias(); - } - return RET_OK; -} - -void Conv2DOpenCLKernel::SetBlockSize() { - auto task_size = static_cast(batch_size_ * OH_ * OW_ * CO_SLICES_); - auto task_size_per_cu = task_size / ocl_runtime_->DeviceComputeUnits(); - int block_size; - if (task_size_per_cu <= 256) { - block_size = 1; - } else if (task_size_per_cu <= 256 * 4) { - block_size = 2; - } else if (task_size_per_cu <= (use_fp16_ ? 256 * 8 : FLT_MAX)) { - block_size = 4; - } else { - block_size = 8; - } - - bool w_kernel_is_1 = - KW_ == 1 && param_->stride_w_ == 1 && param_->dilation_w_ == 1 && param_->pad_l_ == 0 && param_->pad_r_ == 0; - bool h_kernel_is_1 = - KH_ == 1 && param_->stride_h_ == 1 && param_->dilation_h_ == 1 && param_->pad_u_ == 0 && param_->pad_d_ == 0; - if (!w_kernel_is_1 || !h_kernel_is_1) { - block_size = std::min(block_size, 4); - } - - if (block_size == 8) { - block_size_ = {2, 2, 2}; - } else if (block_size == 4) { - block_size_ = {2, 1, 2}; - } else if (block_size == 2) { - block_size_ = {2, 1, 1}; - } else { - block_size_ = {1, 1, 1}; - } +void Conv2DOpenCLKernel::SetConstArgs() { + cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_}; + cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_}; + cl_int4 kernel_stride = {KH_, KW_, param_->stride_h_, param_->stride_w_}; + cl_int4 pad = {param_->pad_u_, param_->pad_d_, param_->pad_l_, param_->pad_r_}; + cl_int2 dilation = {param_->dilation_h_, param_->dilation_w_}; + + int arg_cn = 2; + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_filter_, filter_type_); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, MemType::BUF); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param_->act_type_); + ocl_runtime_->SetKernelArg(kernel_, arg_cn, alpha_); } -void AlignWinogradGlobalLocal(const std::vector &global, const std::vector &local, cl::NDRange *global_range, - cl::NDRange *local_range) { - *local_range = cl::NDRange(local[0], local[1], local[2]); - *global_range = - cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2])); +void Conv2DOpenCLKernel::SetGlobalLocal() { + size_t global_h = batch_size_ * UP_DIV(OH_, block_size_.H); + size_t global_w = UP_DIV(OW_, block_size_.W); + size_t global_c = UP_DIV(CO_SLICES_, block_size_.C); + int local_max = filter_type_ == MemType::IMG ? 64 : 128; + const int local_c_max = 16; + const int OH_threshold = 100; + const int OW_threshold = 100; + const int OC_threshold = 64; + size_t local_c = GetMaxDivisor(global_c, local_c_max); + local_c = std::max(local_c, 1); + size_t local_hw = local_max / local_c; + size_t local_h; + size_t local_w; + if (OH_ >= OH_threshold && OW_ >= OW_threshold && CO_ <= OC_threshold) { // c -> w -> h + local_w = std::min(global_w, local_hw); + local_h = std::min(local_hw / local_w, global_h); + } else { // c -> h -> w + local_h = std::min(global_h, local_hw); + local_w = std::min(local_hw / local_h, global_w); + } + global_size_ = {global_h, global_w, global_c}; + local_size_ = {local_h, local_w, local_c}; + AlignGlobalLocal(global_size_, local_size_); } -void Conv2DOpenCLKernel::SetGlobalLocal() { - if (use_winograd_) { - AlignWinogradGlobalLocal({TILES_XY_, 6, CI_SLICES_}, {8, 6, 4}, &global_4x4to36_, &local_4x4to36_); - AlignWinogradGlobalLocal({UP_DIV(TILES_XY_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 6, 2}, &global_conv_, &local_conv_); - AlignWinogradGlobalLocal({TILES_XY_, 4, CO_SLICES_}, {32, 4, 2}, &global_36to4x4_, &local_36to4x4_); - } else { - size_t global_h = batch_size_ * UP_DIV(OH_, block_size_.H); - size_t global_w = UP_DIV(OW_, block_size_.W); - size_t global_c = UP_DIV(CO_SLICES_, block_size_.C); - const int local_c_max = 16; - const int local_hw_max = 256; - const int OH_threshold = 100; - const int OW_threshold = 100; - const int OC_threshold = 64; - size_t local_c = GetMaxDivisor(global_c, local_c_max); - local_c = std::max(local_c, 1); - size_t local_hw = local_hw_max / local_c; - size_t local_h; - size_t local_w; - if (OH_ >= OH_threshold && OW_ >= OW_threshold && CO_ <= OC_threshold) { // c -> w -> h - local_w = std::min(global_w, local_hw); - local_h = std::min(local_hw / local_w, global_h); - } else { // c -> h -> w - local_h = std::min(global_h, local_hw); - local_w = std::min(local_hw / local_h, global_w); - } - global_size_ = {global_h, global_w, global_c}; - local_size_ = {local_h, local_w, local_c}; - AlignGlobalLocal(global_size_, local_size_); - } +int Conv2DOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()); + ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); + return RET_OK; } std::vector Conv2DOpenCLKernel::GenerateTuningParam() { // don't need to tune local_c std::vector tuning_params = {}; - if (use_winograd_) { - return tuning_params; - } BaseTuningParameter default_tuning_param = BaseTuningParameter(); default_tuning_param.local_size = local_size_; tuning_params.push_back(default_tuning_param); @@ -420,85 +402,6 @@ std::vector Conv2DOpenCLKernel::GenerateTuningParam() { return tuning_params; } -std::string Conv2DOpenCLKernel::Key() { - auto key = OpenCLKernel::Key(); - key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" + - std::to_string(param_->stride_w_) + "_" + std::to_string(param_->dilation_h_) + "_" + - std::to_string(param_->dilation_w_); - return key; -} - -void Conv2DOpenCLKernel::SetConstArgs() { - cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_}; - cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_}; - - int arg_cn; - if (use_winograd_) { - arg_cn = 1; - cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY_, CI_SLICES_}; - ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_); - ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, input_shape); - ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn, _4x4to36_out_shape); - - arg_cn = 0; - cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_}; - cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_}; - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem0_); - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem1_); - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, conv_in_shape); - ocl_runtime_->SetKernelArg(kernel_, arg_cn, conv_out_shape); - - arg_cn = 2; - cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_}; - ocl_runtime_->SetKernelArg(kernel_36to4x4_, 0, winograd_mem1_); - ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); - ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape); - ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape); - ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, static_cast(param_->act_type_)); - ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, static_cast(alpha_)); - } else { - arg_cn = 2; - cl_int4 kernel_stride = {KH_, KW_, param_->stride_h_, param_->stride_w_}; - cl_int4 pad = {param_->pad_u_, param_->pad_d_, param_->pad_l_, param_->pad_r_}; - cl_int2 dilation = {param_->dilation_h_, param_->dilation_w_}; - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape); - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape); - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride); - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad); - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation); - ocl_runtime_->SetKernelArg(kernel_, arg_cn++, static_cast(param_->act_type_)); - ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast(alpha_)); - } -} - -int Conv2DOpenCLKernel::Tune() { - if (use_winograd_) { - return RET_OK; - } - return OpenCLKernel::Tune(); -} - -int Conv2DOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running!"; - if (use_winograd_) { - ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c()); - ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_); - - ocl_runtime_->RunKernel(kernel_, global_conv_, local_conv_); - - ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c()); - ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_); - } else { - ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()); - ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); - } - return RET_OK; -} - bool UseFcReplaceConv(const std::vector &inputs, const std::vector &outputs, ConvParameter *param) { MS_ASSERT(param); @@ -528,6 +431,51 @@ OpParameter *CreateFcParam(const ConvParameter *conv_param) { return reinterpret_cast(fc_param); } +bool UseWinograd4x4To6x6(const ConvParameter *param, const std::vector &inputs, + const std::vector &outputs) { + // not use winograd on adreno gpu + lite::opencl::OpenCLRuntimeWrapper runtime_wrap; + lite::opencl::OpenCLRuntime *runtime = runtime_wrap.GetInstance(); + if (runtime->GetGpuInfo().type == lite::opencl::GpuType::ADRENO) { + return false; + } + + if (!(inputs.size() == 2 || inputs.size() == 3) || outputs.empty()) { + return false; + } + auto input_shape = inputs.front()->shape(); + auto output_shape = outputs.front()->shape(); + if (input_shape.size() != 4 || (output_shape.size() != 2 && output_shape.size() != 4)) { + return false; + } + int batch_size = input_shape[0]; + int IH = input_shape[1]; + int IW = input_shape[2]; + int CI = input_shape[3]; + int OH = output_shape.size() == 2 ? 1 : output_shape[1]; + int OW = output_shape.size() == 2 ? 1 : output_shape[2]; + int CO = output_shape.size() == 2 ? output_shape[1] : output_shape[3]; + int CI_SLICES = UP_DIV(CI, CI_TILE); + int CO_SLICES = UP_DIV(CO, CO_TILE); + int TILE_HW_ = UP_DIV(OH, 4) * UP_DIV(OW, 4); + + bool pad_is_all_0 = param->pad_u_ == 0 && param->pad_d_ == 0 && param->pad_l_ == 0 && param->pad_r_ == 0; + bool pad_is_all_1 = param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 && param->pad_r_ == 1; + bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 && param->stride_w_ == 1 && + param->dilation_h_ == 1 && param->dilation_w_ == 1 && (pad_is_all_0 || pad_is_all_1); + + bool shape_valid = false; + if (pad_is_all_1) { + shape_valid = batch_size == 1 && IH == OH && IW == OW; + } else if (pad_is_all_0) { + shape_valid = batch_size == 1 && IH - 2 == OH && IW - 2 == OW; + } + + bool channel_good = CI_SLICES >= 8 && CO_SLICES >= 8; + bool hw_good = TILE_HW_ >= 16; + return attr_valid && shape_valid && channel_good && hw_good; +} + kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector &inputs, const std::vector &outputs, OpParameter *opParameter, const lite::InnerContext *ctx, const kernel::KernelKey &desc, @@ -549,7 +497,12 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector(conv_param), inputs, outputs); + if (UseWinograd4x4To6x6(conv_param, inputs, outputs)) { + MS_LOG(DEBUG) << "use Winograd algorithm."; + kernel = new (std::nothrow) WinogradOpenCLKernel(reinterpret_cast(conv_param), inputs, outputs); + } else { + kernel = new (std::nothrow) Conv2DOpenCLKernel(reinterpret_cast(conv_param), inputs, outputs); + } real_param = reinterpret_cast(conv_param); if (kernel == nullptr) { MS_LOG(ERROR) << "Create Convolution kernel failed."; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h index 60bbd0ccdb..8a01b8f4cc 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h @@ -27,53 +27,57 @@ namespace mindspore::kernel { +using lite::opencl::MemType; + +constexpr size_t CI_TILE = C4NUM; +constexpr size_t CO_TILE = C4NUM; + +enum FilterFormat { + OHWI, // CO KH KW CI + HWII4OO4, // KH KW CI/CI_TILE CI_TILE CO/CO_TILE CO_TILE + OHWIOgroupI4O4, // CO/Ogroup/CO_TILE KH KW CI/CI_TILE Ogroup CI_TILE CO_TILE +}; + +void ConvertFilter(void *src, void *dst, TypeId src_dtype, TypeId dst_dtype, FilterFormat src_format, + FilterFormat dst_format, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup = 1); + class Conv2DOpenCLKernel : public OpenCLKernel { public: Conv2DOpenCLKernel(OpParameter *parameter, const std::vector &inputs, const std::vector &outputs) - : OpenCLKernel(parameter, inputs, outputs), param_(reinterpret_cast(parameter)) {} + : OpenCLKernel(parameter, inputs, outputs), param_(reinterpret_cast(parameter)) { + bool is_adreno = ocl_runtime_->GetGpuInfo().type == lite::opencl::GpuType::ADRENO; + filter_type_ = is_adreno ? MemType::IMG : MemType::BUF; + } ~Conv2DOpenCLKernel() override = default; int CheckSpecs() override; - int Prepare() override; - void SetGlobalLocal() override; int InitWeights() override; void SetConstArgs() override; - + void SetGlobalLocal() override; int Run() override; - int Tune() override; + + std::string Key() override { + auto key = OpenCLKernel::Key(); + key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" + + std::to_string(param_->stride_w_) + "_" + std::to_string(param_->dilation_h_) + "_" + + std::to_string(param_->dilation_w_); + return key; + } std::vector GenerateTuningParam() override; - std::string Key() override; + int Tune() override { return OpenCLKernel::Tune(); } // for opencl fusion: Conv2D + PReLU(weight is scalar) -> param_.act_type=ActivationType_LEAKY_RELU float alpha_{0.0f}; - private: - void SetBlockSize(); - int InitFilter(); - int InitBias(); - int GenerateWinogradFilter(); - - bool UseWinograd4x4To6x6() { - const bool attr_valid = param_->kernel_h_ == 3 && param_->kernel_w_ == 3 && param_->stride_h_ == 1 && - param_->stride_w_ == 1 && param_->pad_u_ == 1 && param_->pad_d_ == 1 && - param_->pad_l_ == 1 && param_->pad_r_ == 1 && param_->dilation_h_ == 1 && - param_->dilation_w_ == 1 && IH_ == OH_ && IW_ == OW_ && batch_size_ == 1; - const bool channel_good = CI_SLICES_ >= 8 && CO_SLICES_ >= 8; - const bool hw_good = TILES_X_ * TILES_Y_ >= 16; - return attr_valid && channel_good && hw_good; - } - - cl::Kernel kernel_4x4to36_; - cl::Kernel kernel_36to4x4_; - cl::NDRange global_4x4to36_, local_4x4to36_; - cl::NDRange global_conv_, local_conv_; - cl::NDRange global_36to4x4_, local_36to4x4_; - + protected: + void InitAttrs(); + virtual void BuildKernel(); + virtual void InitFilter(); + void InitBias(); bool use_fp16_{false}; size_t sizeof_FLT_{4}; - ConvParameter *param_{nullptr}; int batch_size_{}; int CI_{}; @@ -86,23 +90,21 @@ class Conv2DOpenCLKernel : public OpenCLKernel { int CO_SLICES_{}; int KH_{}; int KW_{}; - void *packed_weight_{nullptr}; + void *packed_filter_{nullptr}; void *packed_bias_{nullptr}; + MemType filter_type_{MemType::BUF}; bool has_bias_{false}; + int TILE_HW_{}; - bool use_winograd_{false}; - int TILES_X_{}; - int TILES_Y_{}; - int TILES_XY_{}; - void *winograd_mem0_{nullptr}; - void *winograd_mem1_{nullptr}; - + private: + void SetBlockSize(); struct { int H{1}; int W{1}; int C{1}; } block_size_; }; + } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONV2D_H_ diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc index 032108a94f..e3ec7c5f50 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc @@ -14,3 +14,420 @@ * limitations under the License. */ #include "src/runtime/kernel/opencl/kernel/fusion_eltwise.h" +#include +#include "src/runtime/kernel/opencl/utils.h" +#include "include/errorcode.h" +#include "nnacl/fp32/activation_fp32.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; + +namespace mindspore::kernel { + +static std::set SupportedOperators = { + // Arithmetic Primitive + Operator_Mul, + Operator_Add, + Operator_Sub, + Operator_Div, + // ArithmeticSelf Primitive + Operator_Neg, + // Other Primitive + Operator_Scale, + // Activation + Operator_Act_NO_ACTIVATION, + Operator_Act_RELU, + Operator_Act_SIGMOID, + Operator_Act_RELU6, + Operator_Act_RELU1, + Operator_Act_TANH, +}; + +std::pair CheckSupportOrCreateParam( + LiteKernel *node, bool create_param = false, + const std::map &replace_map = {}) { + MS_ASSERT(node); + MS_ASSERT(param); + PrimitiveType node_type = node->Type(); + auto operator_ = static_cast(node_type); + auto *op_parameter = reinterpret_cast(node)->GetParameter(); + bool support = false; + FusionEltwiseParameter *param = nullptr; + + if (node_type == PrimitiveType_FusionEltwise) { + support = true; + if (create_param) { + auto *eltwise = reinterpret_cast(node); + param = reinterpret_cast(eltwise->GetParameter()); + eltwise->ClearParameter(); + } + } else if (IsArithmetic(node_type)) { + auto act_type = + static_cast(reinterpret_cast(op_parameter)->activation_type_); + EltwiseOperator act_operator = Activation2Operator(act_type); + support = + node->in_tensors().size() == 2 && SupportedOperators.count(operator_) && SupportedOperators.count(act_operator); + if (create_param) { + param = new (std::nothrow) FusionEltwiseParameter(operator_, node->name(), node->in_tensors(), replace_map); + MS_ASSERT(param); + if (act_operator != Operator_Act_NO_ACTIVATION) { + std::string act_name = schema::EnumNameActivationType(act_type); + auto *fake_tensor = reinterpret_cast(param); + param = + new (std::nothrow) FusionEltwiseParameter(act_operator, act_name, {fake_tensor}, {{fake_tensor, param}}); + MS_ASSERT(param); + } + } + } else if (IsArithmeticSelf(node_type)) { + support = node->in_tensors().size() == 1 && SupportedOperators.count(operator_); + if (create_param) { + param = new (std::nothrow) FusionEltwiseParameter(operator_, node->name(), node->in_tensors(), replace_map); + MS_ASSERT(param); + } + } else if (node_type == schema::PrimitiveType_Scale) { + support = node->in_tensors().size() == 3 && SupportedOperators.count(operator_); + if (create_param) { + param = new (std::nothrow) FusionEltwiseParameter(operator_, node->name(), node->in_tensors(), replace_map); + MS_ASSERT(param); + } + } else if (node_type == schema::PrimitiveType_Activation) { + auto act_type = static_cast(reinterpret_cast(op_parameter)->type_); + EltwiseOperator act_operator = Activation2Operator(act_type); + support = node->in_tensors().size() == 1 && SupportedOperators.count(act_operator); + if (create_param) { + param = new (std::nothrow) FusionEltwiseParameter(act_operator, node->name(), node->in_tensors(), replace_map); + MS_ASSERT(param); + } + } + return {support, param}; +} + +bool IsOperatorSupported(LiteKernel *node) { return CheckSupportOrCreateParam(node).first; } + +FusionEltwiseParameter *CreateFusionEltwiseParameter( + LiteKernel *node, const std::map &replace_map) { + return CheckSupportOrCreateParam(node, true, replace_map).second; +} + +bool IsEltwiseAndOperatorSupported(LiteKernel *node) { + MS_ASSERT(node); + if (!IsOperatorSupported(node)) { + return false; + } + if (node->out_tensors().size() != 1) { + return false; + } + auto *output_tensor = node->out_tensors().front(); + MS_ASSERT(output_tensor); + auto output_info = GpuTensorInfo(output_tensor); + auto output_shape = output_tensor->shape(); + for (auto *in_tensor : node->in_tensors()) { + MS_ASSERT(in_tensor); + auto shape = in_tensor->shape(); + bool is_scalar = shape.empty() || (shape.size() == 1 && shape.front() == 1); + bool is_vector = shape.size() == 1 && shape.front() == output_info.C; + bool _111C = shape.size() == 4 && shape[0] == 1 && shape[1] == 1 && shape[2] == 1 && shape[3] == output_info.C; + bool same_with_out = shape == output_shape; + if (!(is_scalar || is_vector || _111C || same_with_out)) { + return false; + } + if (in_tensor->data_type() != kNumberTypeFloat16 && in_tensor->data_type() != kNumberTypeFloat32) { + return false; + } + } + if (output_tensor->data_type() != kNumberTypeFloat16 && output_tensor->data_type() != kNumberTypeFloat32) { + return false; + } + return true; +} + +int FusionEltwiseOpenCLKernel::Prepare() { + static std::set code_map; + std::string source = Codegen(); + code_map.insert(source); + // std::cout << name() << "\n" << source; + + std::string program_name = "FusionEltwise" + std::to_string(code_map.size()); + std::string kernel_name = "FusionEltwise"; + ocl_runtime_->LoadSource(program_name, source); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + + InitWeights(); + SetGlobalLocal(); + SetConstArgs(); + return RET_OK; +} + +template +void CopyNumber(void *dst, void *src, size_t n) { + MS_ASSERT(dst); + MS_ASSERT(src); + if (sizeof(DstT) == sizeof(SrcT)) { + memcpy(dst, src, n * sizeof(DstT)); + } else { + auto *dst_ = static_cast(dst); + auto *src_ = static_cast(src); + for (int i = 0; i < n; ++i) { + dst_[i] = static_cast(src_[i]); + } + } +} + +int FusionEltwiseOpenCLKernel::InitWeights() { + auto allocator = ocl_runtime_->GetAllocator(); + bool use_fp16 = ocl_runtime_->GetFp16Enable(); + for (auto *tensor : in_tensors_) { + MS_ASSERT(tensor); + if (tensor->IsConst()) { + if (IsScalar(tensor->shape())) { + float value = (tensor->data_type() == kNumberTypeFloat16) ? *(reinterpret_cast(tensor->data_c())) + : *(reinterpret_cast(tensor->data_c())); + // std::cout << "value=" << value << std::endl; + scalar_weights_.push_back(value); + } else { + auto tensor_info = GpuTensorInfo(tensor); + size_t num = tensor_info.ElementsNum; + size_t size = tensor_info.Image2DSize; + void *buffer = allocator->Malloc(size); + allocator->MapBuffer(buffer, CL_MAP_WRITE, nullptr, true); + memset(buffer, 0x00, size); + if (tensor->data_type() == kNumberTypeFloat16) { + if (use_fp16) { + CopyNumber(buffer, tensor->data_c(), num); + } else { + CopyNumber(buffer, tensor->data_c(), num); + } + } else { + if (use_fp16) { + CopyNumber(buffer, tensor->data_c(), num); + } else { + CopyNumber(buffer, tensor->data_c(), num); + } + } + allocator->UnmapBuffer(buffer); + buffer_weights_.push_back(buffer); + } + } + } + return RET_OK; +} + +void FusionEltwiseOpenCLKernel::SetGlobalLocal() { + auto output = GpuTensorInfo(out_tensors_.front()); + global_size_ = {output.N * output.H, output.W, output.Slice}; + local_size_ = {}; + AlignGlobalLocal(global_size_, local_size_); +} + +void FusionEltwiseOpenCLKernel::SetConstArgs() { + auto output = GpuTensorInfo(out_tensors_.front()); + cl_int4 output_shape = {static_cast(output.N), static_cast(output.H), static_cast(output.W), + static_cast(output.C)}; + int arg_idx = 0; + int scalar_idx = 0; + int buffer_idx = 0; + for (auto *in_tensor : in_tensors_) { + MS_ASSERT(in_tensor); + if (in_tensor->IsConst()) { + if (IsScalar(in_tensor->shape())) { + if (ocl_runtime_->GetFp16Enable()) { + auto value = static_cast(scalar_weights_[scalar_idx++]); + ocl_runtime_->SetKernelArg(kernel_, arg_idx, *(reinterpret_cast(&value))); + } else { + ocl_runtime_->SetKernelArg(kernel_, arg_idx, scalar_weights_[scalar_idx++]); + } + } else { + ocl_runtime_->SetKernelArg(kernel_, arg_idx, buffer_weights_[buffer_idx++], lite::opencl::MemType::BUF); + } + } + arg_idx++; // for act input + } + arg_idx++; // for output + ocl_runtime_->SetKernelArg(kernel_, arg_idx, output_shape); +} + +int FusionEltwiseOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + int arg_idx = 0; + for (auto *in_tensor : in_tensors_) { + if (!in_tensor->IsConst()) { + ocl_runtime_->SetKernelArg(kernel_, arg_idx, in_tensor->data_c()); + } + arg_idx++; + } + ocl_runtime_->SetKernelArg(kernel_, arg_idx, out_tensors_.front()->data_c()); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); + return RET_OK; +} + +std::string FusionEltwiseOpenCLKernel::Codegen() { + std::stringstream code; + code << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" + "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n" + "__kernel void FusionEltwise("; + + for (int i = 0; i < in_tensors_.size(); ++i) { + MS_ASSERT(in_tensors_[i]); + if (in_tensors_[i]->IsConst()) { + if (IsScalar(in_tensors_[i]->shape())) { + code << "FLT in" << i << ", "; + } else { + code << "__global FLT4 *input" << i << ", "; + } + } else { + code << "__read_only image2d_t input" << i << ", "; + } + } + + code << "__write_only image2d_t output, int4 output_shape) {\n" + " int N = output_shape.x, H = output_shape.y, W = output_shape.z, C = output_shape.w;\n" + " int SLICES = (C + 3) / 4;\n" + " int nh = get_global_id(0);\n" + " int w = get_global_id(1);\n" + " int slice = get_global_id(2);\n" + " int n = nh / H;\n" + " int h = nh % H;\n" + " if (n >= N || h >= H || w >= W || slice >= SLICES) {\n" + " return;\n" + " }\n"; + + auto output = GpuTensorInfo(out_tensors_.front()); + for (int i = 0; i < in_tensors_.size(); ++i) { + auto *tensor = in_tensors_[i]; + MS_ASSERT(tensor); + auto shape = in_tensors_[i]->shape(); + bool is_scalar = IsScalar(shape); + bool is_vector = shape.size() == 1 && shape.front() == output.C; + bool _111C = shape.size() == 4 && shape[0] == 1 && shape[1] == 1 && shape[2] == 1 && shape[3] == output.C; + if (tensor->IsConst()) { + if (!is_scalar) { + code << " FLT4 in" << i << " = input" << i << "["; + if (is_vector || _111C) { + code << "slice"; + } else { + code << "(nh * W + w) * SLICES + slice"; + } + code << "];\n"; + } + } else { + code << " FLT4 in" << i << " = READ_IMAGE(input" << i << ", smp_zero, (int2)("; + if (is_scalar) { + code << "0, 0"; + } else if (is_vector || _111C) { + code << "slice, 0"; + } else { + code << "w * SLICES + slice, nh"; + } + code << "));\n"; + } + } + code << "\n"; + MS_LOG(DEBUG) << "\n" << reinterpret_cast(op_parameter_)->name_ << ":"; + code << CodegenCore(reinterpret_cast(op_parameter_)); + code << "\n WRITE_IMAGE(output, (int2)(w * SLICES + slice, nh), out);\n" + "}\n\n"; + return code.str(); +} + +std::string FusionEltwiseOpenCLKernel::CodegenCore(FusionEltwiseParameter *param, const std::string &out_name, + int degree) { + std::stringstream code; + std::string log_prefix(degree * 2, ' '); + std::string cl_prefix((degree + 1) * 2, ' '); + + std::vector input_names; + MS_ASSERT(param); + for (const auto &input : param->inputs_) { + if (input.is_leaf_) { + input_names.push_back("in" + std::to_string(GetTensorIdx(reinterpret_cast(input.value_)))); + MS_LOG(DEBUG) << log_prefix << degree << " Tensor=" << input.value_; + } else { + std::string var = GetFormatVarName(input.name_); + input_names.push_back(var); + MS_LOG(DEBUG) << log_prefix << degree << " Parameter(degree=" << degree << ")"; + code << CodegenCore(input.value_, var, degree + 1); + } + } + const std::string &var0 = input_names.at(0); + + static std::map simple_symbols = { + {Operator_Add, '+'}, + {Operator_Sub, '-'}, + {Operator_Mul, '*'}, + {Operator_Div, '/'}, + }; + if (simple_symbols.count(param->operator_)) { + const std::string &var1 = input_names.at(1); + code << cl_prefix << "FLT4 " << out_name << " = " << var0 << " " << simple_symbols[param->operator_] << " " << var1 + << ";\n"; + } else if (param->operator_ == Operator_Neg) { + code << cl_prefix << "FLT4 " << out_name << " = -" << var0 << ";\n"; + } else if (param->operator_ == Operator_Scale) { + const std::string &var1 = input_names.at(1); + const std::string &var2 = input_names.at(2); + code << cl_prefix << "FLT4 " << out_name << " = " << var0 << " * " << var1 << " + " << var2 << ";\n"; + } else { + if (param->operator_ == Operator_Act_NO_ACTIVATION) { + code << cl_prefix << "FLT4 " << out_name << " = " << var0 << ";\n"; + } else if (param->operator_ == Operator_Act_RELU) { + code << cl_prefix << "FLT4 " << out_name << " = max(" << var0 << ", (FLT4)(0.0f));\n"; + } else if (param->operator_ == Operator_Act_SIGMOID) { + code << cl_prefix << "FLT4 " << out_name << " = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-" << var0 << "));\n"; + } else if (param->operator_ == Operator_Act_RELU6) { + code << cl_prefix << "FLT4 " << out_name << " = clamp(" << var0 << ", (FLT4)(0.0f), (FLT4)(6.0f));\n"; + } else if (param->operator_ == Operator_Act_LEAKY_RELU) { + } else if (param->operator_ == Operator_Act_RELU1) { + code << cl_prefix << "FLT4 " << out_name << " = clamp(" << var0 << ", (FLT4)(0.0f), (FLT4)(1.0f));\n"; + } else if (param->operator_ == Operator_Act_TANH) { + std::string exp0 = GetFormatVarName(); + std::string exp1 = GetFormatVarName(); + code << cl_prefix << "FLT4 " << exp0 << " = exp(" + var0 + ");\n"; + code << cl_prefix << "FLT4 " << exp1 << " = exp(-" + var0 + ");\n"; + code << cl_prefix << "FLT4 " << out_name << " = (" << exp0 << " - " << exp1 << ") / (" << exp0 << " + " << exp1 + << "));\n"; + } + } + + return code.str(); +} + +std::string FusionEltwiseOpenCLKernel::GetFormatVarName(std::string name) { + if (var_names_.count(name)) { + return name; + } + if (name.empty()) { + name = "_var_" + std::to_string(var_names_.size()); + } else { + char c = name.front(); + if (c != '_' && !std::isalpha(c)) { + name = '_' + name; + } + std::replace_if( + name.begin(), name.end(), [](char c) { return !std::isalnum(c); }, '_'); + } + var_names_.insert(name); + return name; +} + +int FusionEltwiseOpenCLKernel::GetTensorIdx(lite::Tensor *in_tensor) { + MS_ASSERT(in_tensor); + auto pos = std::find(in_tensors_.begin(), in_tensors_.end(), in_tensor); + if (pos != in_tensors_.end()) { + return pos - in_tensors_.begin(); + } else { + for (const auto &in_kernel : in_kernels_) { + MS_ASSERT(in_kernel); + MS_ASSERT(in_kernel->in_tensors().size()); + MS_ASSERT(in_kernel->out_tensors().size()); + if (in_kernel->Type() == schema::PrimitiveType_ToFormat) { + if (in_tensor == in_kernel->in_tensors().front()) { + return std::find(in_tensors_.begin(), in_tensors_.end(), in_kernel->out_tensors().front()) - + in_tensors_.begin(); + } + } + } + return 0; + } +} + +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.h index b316c4f5d4..b8eb26aaa2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.h @@ -17,4 +17,176 @@ #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FUSION_ELTWISE_H_ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FUSION_ELTWISE_H_ +#include +#include +#include +#include +#include +#include +#include "src/runtime/kernel/opencl/opencl_kernel.h" +#include "src/runtime/kernel/opencl/kernel/arithmetic.h" +#include "src/runtime/kernel/opencl/kernel/arithmetic_self.h" +#include "src/runtime/kernel/opencl/kernel/to_format.h" +#include "schema/ops_generated.h" + +using mindspore::schema::ActivationType; +using mindspore::schema::PrimitiveType; +using mindspore::schema::PrimitiveType_MAX; + +namespace mindspore::kernel { + +constexpr schema::PrimitiveType PrimitiveType_FusionEltwise = static_cast(-100); + +enum EltwiseOperator { + // Arithmetic Primitive + Operator_Mul = PrimitiveType_Mul, + Operator_Add = PrimitiveType_Add, + Operator_Sub = PrimitiveType_Sub, + Operator_Div = PrimitiveType_Div, + Operator_LogicalAnd = PrimitiveType_LogicalAnd, + Operator_LogicalOr = PrimitiveType_LogicalOr, + Operator_Maximum = PrimitiveType_Maximum, + Operator_Minimum = PrimitiveType_Minimum, + Operator_FloorDiv = PrimitiveType_FloorDiv, + Operator_FloorMod = PrimitiveType_FloorMod, + Operator_SquaredDifference = PrimitiveType_SquaredDifference, + Operator_Equal = PrimitiveType_Equal, + Operator_NotEqual = PrimitiveType_NotEqual, + Operator_Less = PrimitiveType_Less, + Operator_LessEqual = PrimitiveType_LessEqual, + Operator_Greater = PrimitiveType_Greater, + Operator_GreaterEqual = PrimitiveType_GreaterEqual, + Operator_Eltwise = PrimitiveType_Eltwise, + + // ArithmeticSelf Primitive + Operator_Abs = PrimitiveType_Abs, + Operator_Ceil = PrimitiveType_Ceil, + Operator_Cos = PrimitiveType_Cos, + Operator_Exp = PrimitiveType_Exp, + Operator_Floor = PrimitiveType_Floor, + Operator_Log = PrimitiveType_Log, + Operator_LogicalNot = PrimitiveType_LogicalNot, + Operator_Round = PrimitiveType_Round, + Operator_Rsqrt = PrimitiveType_Rsqrt, + Operator_Sin = PrimitiveType_Sin, + Operator_Neg = PrimitiveType_Neg, + Operator_Sqrt = PrimitiveType_Sqrt, + Operator_Square = PrimitiveType_Square, + + // Other Primitive + Operator_Scale = schema::PrimitiveType_Scale, + + // Activation + Operator_Act_NO_ACTIVATION = schema::ActivationType_NO_ACTIVATION + PrimitiveType_MAX, + Operator_Act_RELU = schema::ActivationType_RELU + PrimitiveType_MAX, + Operator_Act_SIGMOID = schema::ActivationType_SIGMOID + PrimitiveType_MAX, + Operator_Act_RELU6 = schema::ActivationType_RELU6 + PrimitiveType_MAX, + Operator_Act_ELU = schema::ActivationType_ELU + PrimitiveType_MAX, + Operator_Act_LEAKY_RELU = schema::ActivationType_LEAKY_RELU + PrimitiveType_MAX, + Operator_Act_ABS = schema::ActivationType_ABS + PrimitiveType_MAX, + Operator_Act_RELU1 = schema::ActivationType_RELU1 + PrimitiveType_MAX, + Operator_Act_SOFTSIGN = schema::ActivationType_SOFTSIGN + PrimitiveType_MAX, + Operator_Act_SOFTPLUS = schema::ActivationType_SOFTPLUS + PrimitiveType_MAX, + Operator_Act_TANH = schema::ActivationType_TANH + PrimitiveType_MAX, + Operator_Act_SELU = schema::ActivationType_SELU + PrimitiveType_MAX, + Operator_Act_HSWISH = schema::ActivationType_HSWISH + PrimitiveType_MAX, + Operator_Act_HSIGMOID = schema::ActivationType_HSIGMOID + PrimitiveType_MAX, + Operator_Act_THRESHOLDRELU = schema::ActivationType_THRESHOLDRELU + PrimitiveType_MAX, + Operator_Act_LINEAR = schema::ActivationType_LINEAR + PrimitiveType_MAX, + Operator_Act_HARD_TANH = schema::ActivationType_HARD_TANH + PrimitiveType_MAX, + Operator_Act_SIGN = schema::ActivationType_SIGN + PrimitiveType_MAX, + Operator_Act_SWISH = schema::ActivationType_SWISH + PrimitiveType_MAX, +}; + +struct FusionEltwiseParameter { + struct Node_ { + bool is_leaf_; + FusionEltwiseParameter *value_; // if is_leaf_=true, value_ is a Tensor + std::string name_; + Node_(bool is_leaf, FusionEltwiseParameter *value, std::string value_name) + : is_leaf_(is_leaf), value_(value), name_(std::move(value_name)) {} + }; + OpParameter op_parameter_{}; + EltwiseOperator operator_; + std::string name_; + std::vector inputs_; + + FusionEltwiseParameter(EltwiseOperator operator_init, std::string kernel_name, + const std::vector &in_tensors, + const std::map &replace_map = {}) + : operator_(operator_init), name_(std::move(kernel_name)) { + op_parameter_.type_ = PrimitiveType_FusionEltwise; + snprintf(op_parameter_.name_, strlen("FusionEltwiseParameter"), "FusionEltwiseParameter"); + for (int i = 0; i < in_tensors.size(); ++i) { + auto *in_tensor = in_tensors[i]; + if (replace_map.count(in_tensor)) { + auto *pred_param = replace_map.at(in_tensor); + inputs_.emplace_back(false, pred_param, pred_param->name_); + if (reinterpret_cast(in_tensor) == reinterpret_cast(pred_param)) { + this->name_ = pred_param->name_ + "(" + this->name_ + ")"; + } else { + this->name_ = pred_param->name_ + ", " + this->name_; + } + } else { + inputs_.emplace_back(true, reinterpret_cast(in_tensor), "tensor" + std::to_string(i)); + } + } + } + + ~FusionEltwiseParameter() { + for (const auto &input : inputs_) { + if (!input.is_leaf_) { + delete input.value_; + } + } + } +}; + +constexpr EltwiseOperator Activation2Operator(ActivationType act_type) { + return static_cast(act_type + PrimitiveType_MAX); +} + +FusionEltwiseParameter *CreateFusionEltwiseParameter( + LiteKernel *node, const std::map &replace_map = {}); + +bool IsEltwiseAndOperatorSupported(LiteKernel *node); + +class FusionEltwiseOpenCLKernel : public OpenCLKernel { + public: + FusionEltwiseOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs) + : OpenCLKernel(parameter, inputs, outputs) {} + + ~FusionEltwiseOpenCLKernel() override { + if (op_parameter_ != nullptr) { + delete op_parameter_; + op_parameter_ = nullptr; + } + } + + int Prepare() override; + int InitWeights() override; + void SetGlobalLocal() override; + void SetConstArgs() override; + int Run() override; + + void ClearParameter() { op_parameter_ = nullptr; } + + public: + std::string Codegen(); + std::string CodegenCore(FusionEltwiseParameter *param, const std::string &out_name = "out", int degree = 0); + std::string GetFormatVarName(std::string name = ""); + int GetTensorIdx(lite::Tensor *in_tensor); + + static inline bool IsScalar(const std::vector &shape) { + return shape.empty() || (shape.size() == 1 && shape.front() == 1); + } + + std::set var_names_; + std::vector scalar_weights_; + std::vector buffer_weights_; +}; + +} // namespace mindspore::kernel + #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_FUSION_ELTWISE_H_ diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc index c607bf2c44..25cbbce922 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc @@ -97,7 +97,7 @@ void PadOpenCLKernel::SetConstArgs() { ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before); - ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast(param_->constant_value_)); + ocl_runtime_->SetKernelArg(kernel_, arg_cn, param_->constant_value_); local_size_ = {8, 4, 1}; global_size_ = {output.N * output.H, output.W, output.Slice}; AlignGlobalLocal(global_size_, local_size_); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc new file mode 100644 index 0000000000..2508a6b85b --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc @@ -0,0 +1,195 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/kernel/opencl/kernel/winograd.h" +#include "src/runtime/kernel/opencl/cl/winograd.cl.inc" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; + +namespace mindspore::kernel { + +namespace { +void Align(const std::vector &global, const std::vector &local, cl::NDRange *global_range, + cl::NDRange *local_range) { + *local_range = cl::NDRange(local[0], local[1], local[2]); + *global_range = + cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2])); +} + +std::vector GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, size_t CI) { + constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000, + 0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000, + 0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000}; + constexpr float G[] = {1.0000000000, 0.0000000000, 0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702, + 1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808, + 1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000}; + auto src_fp32 = reinterpret_cast(src); + auto src_fp16 = reinterpret_cast(src); + std::function access_func; + if (dtype == kNumberTypeFloat32) { + access_func = [=](int idx) { return src_fp32[idx]; }; + } else { + access_func = [=](int idx) { return static_cast(src_fp16[idx]); }; + } + // OHWI -> O66I + std::vector dst(CO * 6 * 6 * CI); + if (src == nullptr) { + return dst; + } + for (int co = 0; co < CO; ++co) { + for (int ci = 0; ci < CI; ++ci) { + float in_vals[9]; + for (int kh = 0; kh < 3; ++kh) { + for (int kw = 0; kw < 3; ++kw) { + const int f_index = ((co * 3 + kh) * 3 + kw) * CI + ci; + in_vals[kh * 3 + kw] = access_func(f_index); + } + } + auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3); + auto out_vals = MatrixMultiply(temp_vals.data(), Gt, 6, 3, 6); + for (int kh = 0; kh < 6; ++kh) { + for (int kw = 0; kw < 6; ++kw) { + const int f_index = ((co * 6 + kh) * 6 + kw) * CI + ci; + dst[f_index] = out_vals[kh * 6 + kw]; + } + } + } + } + return dst; +} +} // namespace + +void WinogradOpenCLKernel::BuildKernel() { + std::string program_name = "winograd"; + ocl_runtime_->LoadSource(program_name, GetActDefines() + winograd_source); + ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36"); + ocl_runtime_->BuildKernel(kernel_, program_name, + filter_type_ == MemType::IMG ? "WinogradConv2D_Img" : "WinogradConv2D"); + ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); +} + +void WinogradOpenCLKernel::InitFilter() { + auto allocator = ocl_runtime_->GetAllocator(); + + auto ret = DequantWeight(); + if (ret != RET_OK) { + return; + } + + // allocate opencl memory: buffer or image2d + size_t size = 0; + int Ogroup = 2; + if (filter_type_ == MemType::IMG) { + size_t width = 6 * 6 * UP_ROUND(CI_, CI_TILE); + size_t height = CO_SLICES_; + size_t dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; + size = width * height * CO_TILE * sizeof_FLT_; + packed_filter_ = allocator->Malloc(size, {width, height, dtype}); + } else { + size = UP_DIV(CO_SLICES_, Ogroup) * 6 * 6 * CI_SLICES_ * Ogroup * CI_TILE * CO_TILE * sizeof_FLT_; + packed_filter_ = allocator->Malloc(size); + } + + // rearrange filter + auto filter_tensor = in_tensors_.at(1); + auto winograd_filter = GenerateWinogradFilter(filter_tensor->data_c(), filter_tensor->data_type(), CO_, CI_); + + void *src_data = winograd_filter.data(); + auto src_dtype = kNumberTypeFloat32; + auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32; + std::vector tmp(size, 0); + if (filter_type_ == MemType::IMG) { + ConvertFilter(src_data, tmp.data(), src_dtype, dst_dtype, OHWI, OHWIOgroupI4O4, CO_, 6, 6, CI_); + } else { + ConvertFilter(src_data, tmp.data(), src_dtype, dst_dtype, OHWI, OHWIOgroupI4O4, CO_, 6, 6, CI_, Ogroup); + } + + // unmap + if (filter_type_ == MemType::IMG) { + ocl_runtime_->WriteImage(packed_filter_, tmp.data()); + } else { + allocator->MapBuffer(packed_filter_, CL_MAP_WRITE, nullptr, true); + memcpy(packed_filter_, tmp.data(), size); + allocator->UnmapBuffer(packed_filter_); + } + + FreeDequantedWeight(); +} + +void WinogradOpenCLKernel::AllocateMemory() { + auto allocator = ocl_runtime_->GetAllocator(); + size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; + + size_t width = TILE_HW_; + size_t height = CI_SLICES_ * 36; + winograd_mem0_ = allocator->Malloc(width * height * sizeof_FLT_, {width, height, img_dtype}); + + width = TILE_HW_; + height = CO_SLICES_ * 36; + winograd_mem1_ = allocator->Malloc(width * height * sizeof_FLT_, {width, height, img_dtype}); +} + +void WinogradOpenCLKernel::SetConstArgs() { + AllocateMemory(); + + int arg_cn = 1; + cl_int4 input_shape = {batch_size_, OH_, OW_, CI_SLICES_}; // maybe pad=0, so use OH/OW + ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_); + ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, input_shape); + ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn++, TILE_HW_); + ocl_runtime_->SetKernelArg(kernel_4x4to36_, arg_cn, param_->pad_u_); + + arg_cn = 0; + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem0_); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem1_); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_filter_, filter_type_); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, TILE_HW_); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, CI_SLICES_); + ocl_runtime_->SetKernelArg(kernel_, arg_cn, CO_SLICES_); + + arg_cn = 2; + cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_}; + ocl_runtime_->SetKernelArg(kernel_36to4x4_, 0, winograd_mem1_); + ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, MemType::BUF); + ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape); + ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, TILE_HW_); + ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, param_->act_type_); + ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, alpha_); +} + +void WinogradOpenCLKernel::SetGlobalLocal() { + Align({TILE_HW_, 6, CI_SLICES_}, {8, 6, 4}, &global_4x4to36_, &local_4x4to36_); + Align({UP_DIV(TILE_HW_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 6, 2}, &global_range_, &local_range_); + Align({TILE_HW_, 4, CO_SLICES_}, {32, 4, 2}, &global_36to4x4_, &local_36to4x4_); +} + +int WinogradOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " winograd Running!"; + MS_LOG(DEBUG) << "winograd kernel0 Running!"; + ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c()); + ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_, nullptr, &event_); + + MS_LOG(DEBUG) << "winograd kernel1 Running!"; + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); + + MS_LOG(DEBUG) << "winograd kernel2 Running!"; + ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c()); + ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_, nullptr, &event_); + return RET_OK; +} + +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h new file mode 100644 index 0000000000..b654537fce --- /dev/null +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h @@ -0,0 +1,58 @@ +/** + * Copyright 2019 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_WINOGRAD_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_WINOGRAD_H_ + +#include +#include +#include "src/runtime/kernel/opencl/kernel/conv2d.h" + +namespace mindspore::kernel { + +class WinogradOpenCLKernel : public Conv2DOpenCLKernel { + public: + WinogradOpenCLKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs) + : Conv2DOpenCLKernel(parameter, inputs, outputs) { + filter_type_ = MemType::BUF; + } + + ~WinogradOpenCLKernel() override = default; + + void SetConstArgs() override; + void SetGlobalLocal() override; + int Run() override; + + std::vector GenerateTuningParam() override { return {}; } + int Tune() override { return RET_OK; } + + private: + void BuildKernel() override; + void InitFilter() override; + void AllocateMemory(); + + cl::Kernel kernel_4x4to36_; + cl::Kernel kernel_36to4x4_; + cl::NDRange global_4x4to36_, local_4x4to36_; + cl::NDRange global_36to4x4_, local_36to4x4_; + void *winograd_mem0_{nullptr}; + void *winograd_mem1_{nullptr}; +}; + +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_WINOGRAD_H_ diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc index 0488a92f59..c5e962c579 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc @@ -13,11 +13,650 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - +#include +#include +#include +#include #include "src/runtime/kernel/opencl/opencl_subgraph.h" +#include "src/runtime/kernel/opencl/opencl_kernel.h" +#include "src/runtime/kernel/opencl/kernel/arithmetic.h" +#include "src/runtime/kernel/opencl/kernel/conv2d.h" +#include "src/runtime/kernel/opencl/kernel/fusion_eltwise.h" +#include "src/runtime/kernel/opencl/utils.h" +#include "src/runtime/opencl/opencl_executor.h" +#include "include/errorcode.h" +#include "schema/ops_generated.h" +#include "src/common/utils.h" +#include "nnacl/conv_parameter.h" +#include "nnacl/pad_parameter.h" +#include "nnacl/pooling_parameter.h" +#include "nnacl/fp32/activation_fp32.h" +#include "nnacl/matmul_parameter.h" +#include "nnacl/scale.h" + +using mindspore::schema::ActivationType; +using mindspore::schema::ActivationType_LEAKY_RELU; +using mindspore::schema::ActivationType_NO_ACTIVATION; +using mindspore::schema::ActivationType_RELU; +using mindspore::schema::ActivationType_RELU6; +using mindspore::schema::ActivationType_TANH; +using mindspore::schema::PrimitiveType; +using mindspore::schema::PrimitiveType_Activation; +using mindspore::schema::PrimitiveType_Eltwise; +using mindspore::schema::PrimitiveType_NONE; namespace mindspore::kernel { -void OpenCLSubGraph::Fusion() {} +namespace { + +template +inline bool AIsInB(const T0 *a, const T1 *b) { + MS_ASSERT(a); + MS_ASSERT(b); + return std::find(b->begin(), b->end(), a) != b->end(); +} + +inline bool PredIs(const LiteKernel *node, PrimitiveType type, std::vector *nodes) { + MS_ASSERT(node); + if (node->in_kernels().size() == 1) { + LiteKernel *pred = node->in_kernels().front(); + MS_ASSERT(pred); + if (AIsInB(pred, nodes) && pred->Type() == type && pred->out_kernels().size() == 1) { + MS_ASSERT(pred->out_kernels().front() == node); + return true; + } + } + return false; +} + +inline std::string GetTypeName(const LiteKernel *node) { + MS_ASSERT(node); + if (node->Type() == PrimitiveType_FusionEltwise) { + return "FusionEltwise"; + } else { + return schema::EnumNamePrimitiveType(node->Type()); + } +} + +inline bool NC_N11C(const LiteKernel *node) { + MS_ASSERT(node); + if (node->in_tensors().empty() || node->out_tensors().empty()) { + return false; + } else { + MS_ASSERT(node->in_tensors().front()); + MS_ASSERT(node->out_tensors().front()); + auto input_shape = node->in_tensors().front()->shape(); + auto output_shape = node->out_tensors().front()->shape(); + return input_shape.size() == 2 && output_shape.size() == 4 && + output_shape == std::vector({input_shape[0], 1, 1, input_shape[1]}); + } +} + +inline bool N11C_NC(const LiteKernel *node) { + MS_ASSERT(node); + if (node->in_tensors().empty() || node->out_tensors().empty()) { + return false; + } else { + MS_ASSERT(node->in_tensors().front()); + MS_ASSERT(node->out_tensors().front()); + auto input_shape = node->in_tensors().front()->shape(); + auto output_shape = node->out_tensors().front()->shape(); + return input_shape.size() == 4 && output_shape.size() == 2 && + input_shape == std::vector({output_shape[0], 1, 1, output_shape[1]}); + } +} + +inline bool NC11_NC(const LiteKernel *node) { + if (node->in_tensors().empty() || node->out_tensors().empty()) { + return false; + } else { + MS_ASSERT(node->in_tensors().front()); + MS_ASSERT(node->out_tensors().front()); + auto input_shape = node->in_tensors().front()->shape(); + auto output_shape = node->out_tensors().front()->shape(); + return input_shape.size() == 4 && output_shape.size() == 2 && + input_shape == std::vector({output_shape[0], output_shape[1], 1, 1}); + } +} + +template +std::vector RemoveDuplicationsButKeepOrder(const std::vector &vec) { + std::vector ret; + std::set s; + for (auto *x : vec) { + if (0 == s.count(x)) { + ret.push_back(x); + s.insert(x); + } + } + return ret; +} + +void Merge(LiteKernel *a, LiteKernel *b, bool remove_a) { + MS_ASSERT(a); + MS_ASSERT(b); + if (remove_a) { // pred->tensor0->a->tensor1->b: remove a tensor1 + // update pred out_kernels: a.in_kernels.out_kernels.replace(a,b) + for (auto *pred : a->in_kernels()) { + MS_ASSERT(pred); + auto pred_out_kernels = pred->out_kernels(); + std::replace_if( + pred_out_kernels.begin(), pred_out_kernels.end(), [&](LiteKernel *x) { return x == a; }, b); + pred->set_out_kernels(RemoveDuplicationsButKeepOrder(pred_out_kernels)); + } + + // update b in_tensors: b.in_tensors.replace(a.out_tensors[0], a.in_tensors) + auto b_in_tensors = b->in_tensors(); + for (int i = 0; i < b_in_tensors.size(); ++i) { + if (b_in_tensors[i] == a->out_tensors().front()) { + // reshape: 2nd input tensor is removed + if (a->Type() == schema::PrimitiveType_Reshape) { + b_in_tensors[i] = a->in_tensors().front(); + b->set_in_tensors(b_in_tensors); + } else { + b_in_tensors.erase(b_in_tensors.begin() + i); + b_in_tensors.insert(b_in_tensors.begin() + i, a->in_tensors().begin(), a->in_tensors().end()); + b->set_in_tensors(RemoveDuplicationsButKeepOrder(b_in_tensors)); + } + break; + } + } + + // update b in_kernels: b.in_kernels.replace(a, a.in_kernels) + auto b_in_kernels = b->in_kernels(); + for (int i = 0; i < b_in_kernels.size(); ++i) { + if (a == b_in_kernels[i]) { + b_in_kernels.erase(b_in_kernels.begin() + i); + b_in_kernels.insert(b_in_kernels.begin() + i, a->in_kernels().begin(), a->in_kernels().end()); + b->set_in_kernels(RemoveDuplicationsButKeepOrder(b_in_kernels)); + break; + } + } + } else { // a->tensor1->b->tensor2->succ: remove tensor1 b + // update a.out_tensors + a->set_out_tensors(b->out_tensors()); + + // update a.out_kernels + a->set_out_kernels(b->out_kernels()); + + // update succ in_kernels + for (auto *succ : b->out_kernels()) { + MS_ASSERT(succ); + auto succ_in_kernels = succ->in_kernels(); + std::replace_if( + succ_in_kernels.begin(), succ_in_kernels.end(), [&](LiteKernel *x) { return x == b; }, a); + succ->set_in_kernels(RemoveDuplicationsButKeepOrder(succ_in_kernels)); + } + } +} + +inline void MergeRemoveA(LiteKernel *a, LiteKernel *b, std::set *removed_set, + bool do_check_specs = true) { + MS_ASSERT(a); + MS_ASSERT(b); + MS_ASSERT(removed_set); + Merge(a, b, true); + removed_set->insert(a); + if (do_check_specs && reinterpret_cast(b)->CheckSpecs() != RET_OK) { + MS_LOG(ERROR) << "fusion kernel CheckSpecs() error: kernel name is " << b->name(); + } +} + +inline void MergeRemoveB(LiteKernel *a, LiteKernel *b, std::set *removed_set) { + MS_ASSERT(a); + MS_ASSERT(b); + MS_ASSERT(removed_set); + Merge(a, b, false); + removed_set->insert(b); + if (reinterpret_cast(a)->CheckSpecs() != RET_OK) { + MS_LOG(ERROR) << "fusion kernel CheckSpecs() error: kernel name is " << a->name(); + } +} + +// Pad + Conv2D +// Pad + DepthwiseConv2D +// Pad + DeConv2D +// Pad + Pooling +template +void TryMergePad(LiteKernel *node, std::set *removed_set) { + MS_ASSERT(node); + MS_ASSERT(removed_set); + LiteKernel *pad = node->in_kernels().front(); + MS_ASSERT(pad); + if (pad->in_tensors().front()->shape().size() != 4) { + return; + } + auto *pad_param = reinterpret_cast(reinterpret_cast(pad)->GetParameter()); + MS_ASSERT(pad_param); + if (pad_param->pad_mode_ != schema::PaddingMode::PaddingMode_CONSTANT || + std::fabs(pad_param->constant_value_) > 1e-5) { + return; + } + + auto *conv_param = reinterpret_cast(reinterpret_cast(node)->GetParameter()); + MS_ASSERT(conv_param); + conv_param->pad_u_ += pad_param->paddings_[2]; + conv_param->pad_d_ += pad_param->paddings_[3]; + conv_param->pad_l_ += pad_param->paddings_[4]; + conv_param->pad_r_ += pad_param->paddings_[5]; + + MergeRemoveA(pad, node, removed_set); + MS_LOG(DEBUG) << "Merge Pad and " + GetTypeName(node) + " success"; +} + +// Conv2D + Reshape(N11C->NC) +void TryMergeConvReshape(LiteKernel *reshape, std::set *removed_set) { + MS_ASSERT(reshape); + MS_ASSERT(removed_set); + if (N11C_NC(reshape)) { + LiteKernel *conv = reshape->in_kernels().front(); + MS_ASSERT(conv); + MergeRemoveB(conv, reshape, removed_set); + MS_LOG(DEBUG) << "Merge Conv2D and Reshape(N11C->NC) success"; + } +} + +// FullConnection + Reshape(NC->N11C or N11C->NC) +void TryMergeFcReshape(LiteKernel *reshape, std::set *removed_set) { + MS_ASSERT(reshape); + MS_ASSERT(removed_set); + bool NC_N11C_flag = NC_N11C(reshape); + if (NC_N11C_flag || N11C_NC(reshape)) { + LiteKernel *fc = reshape->in_kernels().front(); + MS_ASSERT(fc); + MergeRemoveB(fc, reshape, removed_set); + MS_LOG(DEBUG) << "Merge FullConnection and Reshape" + (NC_N11C_flag ? std::string("(NC->N11C)") : "(N11C->NC)") + + " success"; + } +} + +// Reshape(NC11->NC) + FullConnection +// Reshape(NC->N11C) + FullConnection +void TryMergeReshapeFc(LiteKernel *fc, std::set *removed_set) { + MS_ASSERT(fc); + MS_ASSERT(removed_set); + LiteKernel *reshape = fc->in_kernels().front(); + MS_ASSERT(reshape); + bool NC11_NC_flag = NC11_NC(reshape); + if (NC11_NC_flag || NC_N11C(reshape)) { + MergeRemoveA(reshape, fc, removed_set); + MS_LOG(DEBUG) << "Merge Reshape" + (NC11_NC_flag ? std::string("(NC11->NC)") : "(NC->N11C)") + + " and FullConnection success"; + } +} + +// Arithmetic(NO_ACTIVATION) + Activation(RELU/RELU6) +void TryMergeArithmeticAct(LiteKernel *act, std::set *removed_set) { + MS_ASSERT(act); + MS_ASSERT(removed_set); + LiteKernel *arithmetic = act->in_kernels().front(); + MS_ASSERT(arithmetic); + auto *arithmetic_param = + reinterpret_cast(reinterpret_cast(arithmetic)->GetParameter()); + auto *act_param = reinterpret_cast(reinterpret_cast(act)->GetParameter()); + MS_ASSERT(arithmetic_param); + MS_ASSERT(act_param); + + if (arithmetic_param->activation_type_ == ActivationType_NO_ACTIVATION && + (act_param->type_ == ActivationType_RELU || act_param->type_ == ActivationType_RELU6)) { + arithmetic_param->activation_type_ = act_param->type_; + MergeRemoveB(arithmetic, act, removed_set); + MS_LOG(DEBUG) << "Merge " + GetTypeName(arithmetic) + "(NO_ACTIVATION) and Activation(RELU or RELU6) success"; + } +} + +// Conv2D(NO_ACTIVATION) + Activation(RELU/RELU6/TANH) +// FullConnection(NO_ACTIVATION) + Activation(RELU/RELU6/TANH) +template +void TryMergeActivation(LiteKernel *act, std::set *removed_set) { + MS_ASSERT(node); + MS_ASSERT(removed_set); + auto *act_param = reinterpret_cast(reinterpret_cast(act)->GetParameter()); + LiteKernel *node = act->in_kernels().front(); + auto *param = reinterpret_cast(reinterpret_cast(node)->GetParameter()); + MS_ASSERT(param); + if (param->act_type_ == ActType_No) { + param->act_type_ = static_cast(act_param->type_); + std::string act_name; + if (act_param->type_ == ActivationType_RELU) { + act_name = "RELU"; + } else if (act_param->type_ == ActivationType_RELU6) { + act_name = "RELU6"; + } else if (act_param->type_ == ActivationType_TANH) { + act_name = "TANH"; + } + MergeRemoveB(node, act, removed_set); + MS_LOG(DEBUG) << "Merge " + GetTypeName(node) + "(NO_ACTIVATION) and Activation(" + act_name + ") success"; + } +} + +// Conv2D(NO_ACTIVATION) + PReLU(weight is scalar) +void TryMergeConvPReLU(LiteKernel *prelu, std::set *removed_set) { + MS_ASSERT(prelu); + MS_ASSERT(removed_set); + if (prelu->in_tensors().size() != 2) { + return; + } + auto *prelu_weight = prelu->in_tensors().at(1); + bool shape_is_valid = + prelu_weight->IsScalar() || (prelu_weight->shape().size() == 1 && prelu_weight->shape().front() == 1); + if (!shape_is_valid) { + return; + } + if (prelu_weight->data_type() != kNumberTypeFloat32) { + return; + } + + LiteKernel *conv = prelu->in_kernels().front(); + auto *param = reinterpret_cast(reinterpret_cast(conv)->GetParameter()); + MS_ASSERT(param); + if (param->act_type_ == ActType_No) { + param->act_type_ = static_cast(ActivationType_LEAKY_RELU); + reinterpret_cast(conv)->alpha_ = *reinterpret_cast(prelu_weight->data_c()); + MergeRemoveB(conv, prelu, removed_set); + MS_LOG(DEBUG) << "Merge Conv2D(NO_ACTIVATION) and PReLU(weight is scalar) success"; + } +} + +int TryFusionConvScaleWeight(LiteKernel *conv_kernel, LiteKernel *scale_kernel) { + MS_ASSERT(conv_kernel); + MS_ASSERT(scale_kernel); + auto *scale_param = + reinterpret_cast(reinterpret_cast(scale_kernel)->GetParameter()); + MS_ASSERT(scale_param); + MS_ASSERT(conv_kernel->in_tensors().size() >= 2); + auto *filter = conv_kernel->in_tensors().at(1); + auto *bias = conv_kernel->in_tensors().size() == 3 ? conv_kernel->in_tensors().at(2) : nullptr; + auto *scale = scale_kernel->in_tensors().at(1); + auto *offset = scale_kernel->in_tensors().at(2); + MS_ASSERT(filter); + MS_ASSERT(bias); + MS_ASSERT(scale); + MS_ASSERT(offset); + + if (scale_kernel->in_tensors().size() != 3) { + return RET_ERROR; + } + if (scale->shape().size() != 1 || scale->shape().at(0) != filter->shape().back() || + scale->shape() != offset->shape()) { + return RET_ERROR; + } + if (!(scale_param->axis_ == -1 || scale_param->axis_ == 3)) { + return RET_ERROR; + } + if (filter->data_type() != kNumberTypeFloat32 || (bias && bias->data_type() != kNumberTypeFloat32) || + scale->data_type() != kNumberTypeFloat32 || offset->data_type() != kNumberTypeFloat32) { + return RET_ERROR; + } + + // update filter: filter*=scale + MS_ASSERT(filter->shape().size() == 4); + int CI = filter->shape()[0]; + int KH = filter->shape()[1]; + int KW = filter->shape()[2]; + int CO = filter->shape()[3]; + auto *filter_data = reinterpret_cast(filter->data_c()); + auto *scale_data = reinterpret_cast(scale->data_c()); + for (int i = 0; i < CI * KH * KW * CO; ++i) { + filter_data[i] *= scale_data[i % CO]; + } + + // update bias: bias=bias*scale+offset + if (bias != nullptr) { + auto *bias_data = reinterpret_cast(bias->data_c()); + auto *offset_data = reinterpret_cast(offset->data_c()); + for (int co = 0; co < CO; ++co) { + bias_data[co] *= scale_data[co]; + bias_data[co] += offset_data[co]; + } + } else { // if deconv dont't have bias, let scale's offset be deconv's bias + auto tmp = conv_kernel->in_tensors(); + tmp.push_back(offset); + conv_kernel->set_in_tensors(tmp); + } + return RET_OK; +} + +// DeConv2D + Scale (can't both has activation) +void TryMergeDeconvScale(LiteKernel *scale, std::set *removed_set) { + MS_ASSERT(scale); + MS_ASSERT(removed_set); + LiteKernel *deconv = scale->in_kernels().front(); + MS_ASSERT(deconv); + + // check act_type_ + auto *deconv_param = reinterpret_cast(reinterpret_cast(deconv)->GetParameter()); + auto *scale_param = reinterpret_cast(reinterpret_cast(scale)->GetParameter()); + MS_ASSERT(deconv_param); + MS_ASSERT(scale_param); + if (deconv_param->act_type_ == ActType_No) { + if (!(scale_param->activation_type_ == ActivationType_NO_ACTIVATION || + scale_param->activation_type_ == ActivationType_RELU || + scale_param->activation_type_ == ActivationType_RELU6)) { + return; + } + } else if (deconv_param->act_type_ == ActType_Relu || deconv_param->act_type_ == ActType_Relu6) { + if (deconv_param->act_type_ != ActType_No) { + return; + } + } else { + return; + } + + // fusion weight + if (TryFusionConvScaleWeight(deconv, scale) == RET_ERROR) { + return; + } + + // update act_type_ + if (deconv_param->act_type_ == ActType_No) { + deconv_param->act_type_ = static_cast(scale_param->activation_type_); + } + + MergeRemoveB(deconv, scale, removed_set); + MS_LOG(DEBUG) << "Merge DeConv2D and Scale success"; +} + +void CreateEltwiseKernelReplaceOld(FusionEltwiseParameter *param, LiteKernel *old, std::vector *nodes, + std::set *removed_set) { + MS_ASSERT(param); + MS_ASSERT(old); + MS_ASSERT(nodes); + MS_ASSERT(removed_set); + auto *eltwise = new (std::nothrow) + FusionEltwiseOpenCLKernel(reinterpret_cast(param), old->in_tensors(), old->out_tensors()); + if (eltwise == nullptr) { + MS_LOG(ERROR) << "create FusionEltwiseOpenCLKernel error."; + return; + } + eltwise->set_name("FusionEltwise: " + param->name_); + eltwise->set_in_kernels(old->in_kernels()); + eltwise->set_out_kernels(old->out_kernels()); + + for (auto *pred : old->in_kernels()) { + MS_ASSERT(pred); + auto tmp = pred->out_kernels(); + std::replace_if( + tmp.begin(), tmp.end(), [&](LiteKernel *x) { return x == old; }, eltwise); + pred->set_out_kernels(tmp); + } + + for (auto *succ : old->out_kernels()) { + MS_ASSERT(succ); + auto tmp = succ->in_kernels(); + std::replace_if( + tmp.begin(), tmp.end(), [&](LiteKernel *x) { return x == old; }, eltwise); + succ->set_in_kernels(tmp); + } + std::replace(nodes->begin(), nodes->end(), old, reinterpret_cast(eltwise)); + removed_set->insert(old); +} + +// Eltwise + Eltwise +int TryMergeEltwiseEltwise(LiteKernel *node, std::vector *nodes, std::set *removed_set) { + MS_ASSERT(node); + MS_ASSERT(nodes); + MS_ASSERT(removed_set); + // node must be eltwise-like op + if (!IsEltwiseAndOperatorSupported(node)) { + return RET_ERROR; + } + + // preds must contain eltwise-like op + const std::vector preds = node->in_kernels(); + std::set pred_eltwises; + std::map pred_params; + for (LiteKernel *pred : preds) { + MS_ASSERT(pred); + if (AIsInB(pred, nodes) && IsEltwiseAndOperatorSupported(pred) && pred->out_kernels().size() == 1) { + auto *tensor = pred->out_tensors().front(); + MS_ASSERT(pred->out_kernels().front() == node); + MS_ASSERT(AIsInB(tensor, node.in_tensors())); + pred_eltwises.insert(pred); + // create FusionEltwiseParameter for this pred eltwise + auto param = CreateFusionEltwiseParameter(pred); + pred_params.emplace(tensor, param); + } + } + if (pred_eltwises.empty()) { + return RET_ERROR; + } + + // 1. create FusionEltwiseParameter for this node + FusionEltwiseParameter *param = CreateFusionEltwiseParameter(node, pred_params); + MS_ASSERT(param); + // 2. merge pred eltwise op + for (LiteKernel *pred_eltwise : pred_eltwises) { + MergeRemoveA(pred_eltwise, node, removed_set, false); + } + // 3. create FusionFusionEltwiseOpenCLKernel and replace old kernel by new + CreateEltwiseKernelReplaceOld(param, node, nodes, removed_set); + + MS_LOG(DEBUG) << "Merge Eltwise and Eltwise success: " << param->name_; + return RET_OK; +} + +} // namespace + +void OpenCLSubGraph::Fusion() { + MS_LOG(DEBUG) << "start Fusion"; + + std::vector input_nodes; + for (auto *node : nodes_) { + if (std::any_of(node->in_tensors().begin(), node->in_tensors().end(), + [&](lite::Tensor *tensor) { return AIsInB(tensor, &in_tensors_); })) { + input_nodes.push_back(node); + } + } + + auto cmp = [&](LiteKernel *a, LiteKernel *b) { + return std::find(nodes_.begin(), nodes_.end(), a) > std::find(nodes_.begin(), nodes_.end(), b); + }; + std::priority_queue, decltype(cmp)> q(cmp, input_nodes); + std::set qset(input_nodes.begin(), input_nodes.end()); + std::set removed_set; + while (!q.empty()) { + LiteKernel *node = q.top(); + MS_ASSERT(node); + q.pop(); + qset.erase(node); + if (AIsInB(node, &removed_set)) { + continue; + } + + // insert node->out_kernels to q only if succ + // 1. not in q + // 2. not be removed + // 3. in nodes_ + for (auto *succ : node->out_kernels()) { + if (!AIsInB(succ, &qset) && !AIsInB(succ, &removed_set) && AIsInB(succ, &nodes_)) { + q.push(succ); + qset.insert(succ); + } + } + + // do element-wise fusion, like mul+add, mul+add+relu + if (TryMergeEltwiseEltwise(node, &nodes_, &removed_set) == RET_OK) { + continue; + } + + // do special fusion, like pad+conv2d, fc+reshape + switch (node->Type()) { + case schema::PrimitiveType_Conv2D: + case schema::PrimitiveType_DepthwiseConv2D: + case schema::PrimitiveType_DeConv2D: { + if (PredIs(node, schema::PrimitiveType_Pad, &nodes_)) { + TryMergePad(node, &removed_set); + } + break; + } + case schema::PrimitiveType_Pooling: { + if (PredIs(node, schema::PrimitiveType_Pad, &nodes_)) { + TryMergePad(node, &removed_set); + } + break; + } + case schema::PrimitiveType_Reshape: { + if (PredIs(node, schema::PrimitiveType_FullConnection, &nodes_)) { + TryMergeFcReshape(node, &removed_set); + } else if (PredIs(node, schema::PrimitiveType_Conv2D, &nodes_)) { + TryMergeConvReshape(node, &removed_set); + } + break; + } + case schema::PrimitiveType_FullConnection: { + if (PredIs(node, schema::PrimitiveType_Reshape, &nodes_)) { + TryMergeReshapeFc(node, &removed_set); + } + break; + } + case schema::PrimitiveType_Activation: { + // try merge Conv2D/FC(without act) + RELU/RELU6/TANH + auto *param = reinterpret_cast(reinterpret_cast(node)->GetParameter()); + MS_ASSERT(param); + if (param->type_ == ActivationType_RELU || param->type_ == ActivationType_RELU6 || + param->type_ == ActivationType_TANH) { + if (PredIs(node, schema::PrimitiveType_Conv2D, &nodes_)) { + TryMergeActivation(node, &removed_set); + break; + } else if (PredIs(node, schema::PrimitiveType_FullConnection, &nodes_)) { + TryMergeActivation(node, &removed_set); + break; + } + } + if (std::any_of(ArithmeticPrimitives.begin(), ArithmeticPrimitives.end(), + [&](schema::PrimitiveType type) { return PredIs(node, type, &nodes_); })) { + TryMergeArithmeticAct(node, &removed_set); + } + break; + } + case schema::PrimitiveType_PReLU: { + if (PredIs(node, schema::PrimitiveType_Conv2D, &nodes_)) { + TryMergeConvPReLU(node, &removed_set); + break; + } + break; + } + case schema::PrimitiveType_Scale: { + if (PredIs(node, schema::PrimitiveType_DeConv2D, &nodes_)) { + TryMergeDeconvScale(node, &removed_set); + break; + } + break; + } + default: + break; + } + } + + for (auto kernel : removed_set) { + delete kernel; + } + MS_LOG(DEBUG) << "number of kernels(before fusion): " << nodes_.size(); + nodes_.erase( + std::remove_if(nodes_.begin(), nodes_.end(), [&](LiteKernel *node) { return AIsInB(node, &removed_set); }), + nodes_.end()); + MS_LOG(DEBUG) << "number of kernels(after fusion) : " << nodes_.size(); +} } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc index 176cba173c..d7ef223208 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc @@ -71,6 +71,57 @@ int OpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { return RET_OK; } +void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) { + printf("%-30s", name().c_str()); + if (out_tensors().empty()) { + return; + } + auto *tensor = out_tensors()[0]; + auto mem_type = GetMemType(); + if (tensor == nullptr || tensor->data_c() == nullptr) { + return; + } + + GpuTensorInfo img_info(tensor); + auto size = mem_type == lite::opencl::MemType::BUF ? img_info.OriginSize : img_info.Image2DSize; + std::vector data(size); + auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); + auto runtime = runtime_wrapper.GetInstance(); + auto allocator = runtime->GetAllocator(); + runtime->SyncCommandQueue(); + if (mem_type == lite::opencl::MemType::BUF) { + allocator->MapBuffer(tensor->data_c(), CL_MAP_READ, nullptr, true); + memcpy(data.data(), tensor->data_c(), img_info.OriginSize); + allocator->UnmapBuffer(tensor->data_c()); + } else { + runtime->ReadImage(tensor->data_c(), data.data()); + } + + printf("shape=("); + auto shape = tensor->shape(); + for (int i = 0; i < shape.size(); ++i) { + printf("%4d", shape[i]); + if (i + 1 < shape.size()) { + printf(","); + } + } + printf(") "); + + auto total_num = mem_type == lite::opencl::MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num; + for (int i = 0; i < print_num && i < total_num; ++i) { + if (tensor->data_type() == kNumberTypeFloat16) { + printf("%d %7.3f | ", i, reinterpret_cast(data.data())[i]); + } else { + printf("%d %7.3f | ", i, reinterpret_cast(data.data())[i]); + } + } + printf("\n"); + + if (!out_file.empty()) { + (void)WriteToBin(out_file, data.data(), data.size()); + } +} + int OpenCLKernel::PostProcess() { for (auto *output : this->out_tensors()) { MS_ASSERT(output != nullptr); @@ -213,8 +264,10 @@ int OpenCLKernel::DequantWeight() { #ifdef ENABLE_ARM64 if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) { dequant_weight = kernel::DequantUtil::DequantData(weight_tensor); + weight_tensor->set_data_type(kNumberTypeFloat16); } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt16) { dequant_weight = kernel::DequantUtil::DequantData(weight_tensor); + weight_tensor->set_data_type(kNumberTypeFloat16); } else { set_flag = false; } @@ -224,8 +277,10 @@ int OpenCLKernel::DequantWeight() { } else { if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt8) { dequant_weight = kernel::DequantUtil::DequantData(weight_tensor); + weight_tensor->set_data_type(kNumberTypeFloat32); } else if (in_tensors_.at(kWeightIndex)->data_type() == kNumberTypeInt16) { dequant_weight = kernel::DequantUtil::DequantData(weight_tensor); + weight_tensor->set_data_type(kNumberTypeFloat32); } else { set_flag = false; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index b822616c88..69ff966f6e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -26,6 +26,7 @@ #include "include/errorcode.h" #include "src/runtime/opencl/opencl_runtime.h" #include "src/runtime/kernel/arm/base/dequant.h" +#include "src/runtime/kernel/opencl/utils.h" using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; @@ -181,6 +182,7 @@ class OpenCLKernel : public LiteKernel { virtual int Tune(); int GetImageSize(size_t idx, std::vector *img_size); + void PrintOutput(int print_num = 10, const std::string &out_file = ""); lite::opencl::MemType GetMemType() { return out_mem_type_; } void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; } OpParameter *GetParameter() { return op_parameter_; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc index fcf75ff30d..57cba111b2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc @@ -19,12 +19,8 @@ #include #include #include "src/kernel_registry.h" -#include "src/runtime/opencl/opencl_runtime.h" -#include "src/runtime/kernel/opencl/opencl_kernel.h" #include "src/common/file_utils.h" -using mindspore::lite::KernelRegistrar; -using mindspore::lite::opencl::MemType; using mindspore::schema::ActivationType_LEAKY_RELU; using mindspore::schema::ActivationType_RELU; using mindspore::schema::ActivationType_RELU6; @@ -298,65 +294,6 @@ int WriteToBin(const std::string &file_path, void *data, size_t size) { return 0; } -void PrintTensor(const lite::Tensor *tensor, MemType mem_type, int n, const std::string &out_file) { - if (tensor == nullptr || tensor->data_c() == nullptr) { - return; - } - - GpuTensorInfo img_info(tensor); - auto size = mem_type == MemType::BUF ? img_info.OriginSize : img_info.Image2DSize; - std::vector data(size); - auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); - auto runtime = runtime_wrapper.GetInstance(); - auto allocator = runtime->GetAllocator(); - runtime->SyncCommandQueue(); - allocator->MapBuffer(tensor->data_c(), CL_MAP_READ, nullptr, true); - if (mem_type == MemType::BUF) { - memcpy(data.data(), tensor->data_c(), img_info.OriginSize); - } else { - auto row_size = img_info.width * img_info.FLT4_size; - for (int i = 0; i < img_info.height; ++i) { - memcpy(reinterpret_cast(data.data()) + i * row_size, - static_cast(tensor->data_c()) + i * img_info.RowPitch(), row_size); - } - } - allocator->UnmapBuffer(tensor->data_c()); - - printf("shape=("); - auto shape = tensor->shape(); - for (int i = 0; i < shape.size(); ++i) { - printf("%4d", shape[i]); - if (i + 1 < shape.size()) { - printf(","); - } - } - printf(") "); - - auto num = mem_type == MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num; - for (int i = 0; i < n && i < num; ++i) { - if (tensor->data_type() == kNumberTypeFloat16) { - printf("%d %7.3f | ", i, reinterpret_cast(data.data())[i]); - } else { - printf("%d %7.3f | ", i, reinterpret_cast(data.data())[i]); - } - } - printf("\n"); - - if (!out_file.empty()) { - (void)WriteToBin(out_file, data.data(), data.size()); - } -} - -void PrintKernelOutput(OpenCLKernel *kernel, int n, const std::string &out_file) { - if (kernel == nullptr) { - return; - } - printf("%-30s", kernel->name().c_str()); - if (!kernel->out_tensors().empty()) { - PrintTensor(kernel->out_tensors()[0], kernel->GetMemType(), n, out_file); - } -} - std::vector GetNHWCShape(const std::vector &tensor_shape) { int n, h, w, c; n = h = w = c = 1; diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h index fbcf5552eb..45e7fe96dd 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.h +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h @@ -25,8 +25,6 @@ #include "nnacl/op_base.h" #include "src/lite_kernel.h" #include "src/common/utils.h" -#include "src/runtime/opencl/opencl_runtime.h" -#include "src/runtime/kernel/opencl/opencl_kernel.h" namespace mindspore::lite { kernel::LiteKernel *GetOpenCLKernel(const std::vector &in_tensors, const std::vector &out_tensors, @@ -59,11 +57,6 @@ std::string CLErrorCode(cl_int error_code); int WriteToBin(const std::string &file_path, void *data, size_t size); -void PrintTensor(const lite::Tensor *tensor, lite::opencl::MemType mem_type, int n = 10, - const std::string &out_file = ""); - -void PrintKernelOutput(OpenCLKernel *kernel, int n = 10, const std::string &out_file = ""); - std::vector GetNHWCShape(const std::vector &tensor_shape); std::vector GetImage2dShapeFromNHWC(const std::vector &tensor_shape, schema::Format format); @@ -154,38 +147,6 @@ std::vector MatrixMultiply(const T A[], const T B[], int M, int N, int K) { return C; } -template -void ConvertConvWeight4DTo7D(void *src, void *dst, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup = 1, - const size_t CI_TILE = 4, const size_t CO_TILE = 4) { - MS_ASSERT(src); - MS_ASSERT(dst); - MS_ASSERT(CI_TILE); - MS_ASSERT(CO_TILE); - MS_ASSERT(OGroup); - if (CO_TILE == 0 || CI_TILE == 0) return; - auto origin_weight = reinterpret_cast(src); - auto packed_weight = reinterpret_cast(dst); - auto CI_SLICES = UP_DIV(CI, CI_TILE); - for (size_t co = 0, src_idx = 0; co < CO; ++co) { - for (size_t kh = 0; kh < KH; ++kh) { - for (size_t kw = 0; kw < KW; ++kw) { - for (size_t ci = 0; ci < CI; ++ci) { - size_t co_outer = co / (CO_TILE * OGroup); - size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE; - size_t co_inner = co % CO_TILE; - size_t ci_outer = ci / CI_TILE; - size_t ci_inner = ci % CI_TILE; - size_t dst_idx = - (((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) * - CO_TILE + - co_inner; - packed_weight[dst_idx] = static_cast(origin_weight[src_idx++]); - } - } - } - } -} - } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_UTILS_H_ diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc index 2d32db5d6b..cd77c43b31 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc @@ -507,6 +507,37 @@ bool OpenCLRuntime::BuildProgram(const std::string &build_options, const cl::Pro return true; } +int OpenCLRuntime::ReadOrWriteImage(void *buffer, void *data, bool is_read) { + cl::CommandQueue *command_queue = profiling_ ? profiling_command_queue_ : default_command_queue_; + auto *image = reinterpret_cast(allocator_->GetImage(buffer)); + if (image == nullptr) { + MS_LOG(WARNING) << "Can't get Image2D for " << buffer; + return RET_ERROR; + } + std::vector img_size; + int ret = allocator_->GetImageSize(buffer, &img_size); + if (ret != RET_OK) { + MS_LOG(WARNING) << "Can't get GetImageSize for " << buffer; + return RET_ERROR; + } + cl::array origin = {0, 0, 0}; + cl::array region = {img_size[0], img_size[1], 1}; + if (is_read) { + ret = command_queue->enqueueReadImage(*image, true, origin, region, 0, 0, data, nullptr, nullptr); + } else { + ret = command_queue->enqueueWriteImage(*image, true, origin, region, 0, 0, data, nullptr, nullptr); + } + if (ret != CL_SUCCESS) { + MS_LOG(ERROR) << CLErrorCode(ret); + return RET_ERROR; + } + return RET_OK; +} + +int OpenCLRuntime::ReadImage(void *buffer, void *dst_data) { return ReadOrWriteImage(buffer, dst_data, true); } + +int OpenCLRuntime::WriteImage(void *buffer, void *src_data) { return ReadOrWriteImage(buffer, src_data, false); } + bool OpenCLRuntime::CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue, bool sync) const { if (command_queue == nullptr) { diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h index d8704d09a7..3af20f5c69 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h @@ -119,6 +119,9 @@ class OpenCLRuntime { const std::set &build_options = {}); int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr); + int ReadOrWriteImage(void *buffer, void *data, bool is_read); + int ReadImage(void *buffer, void *dst_data); + int WriteImage(void *buffer, void *src_data); bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, bool sync = false) const; bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,