Browse Source

fix pad/conv2d/winograd bugs

tags/v1.2.0-rc1
wangdongxu 4 years ago
parent
commit
779d60faeb
7 changed files with 276 additions and 169 deletions
  1. +89
    -95
      mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
  2. +149
    -51
      mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl
  3. +3
    -8
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
  4. +7
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc
  5. +23
    -9
      mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc
  6. +0
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h
  7. +5
    -4
      mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc

+ 89
- 95
mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl View File

@@ -91,11 +91,11 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
}

if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
} else {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
}
#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
#else
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
#endif
}

__kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
@@ -172,17 +172,17 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
}

if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
} // end if (oh1 < OH)
} else {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
} // end (oh1 < OH)
}
#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
} // end if (oh1 < OH)
#else
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
} // end (oh1 < OH)
#endif
}

__kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
@@ -283,29 +283,27 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t
out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1));
}

if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
} // end if (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
} // end if (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
} // end if (oh1 < OH)
} // end if (co_slice1 < CO_SLICES)
} else {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
} // end (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
} // end if (oh1 < OH)
} // end if (co_slice1 < CO_SLICES)
}
} // end if (co_slice1 < CO_SLICES)
#else
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
} // end (oh1 < OH)
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
} // end if (oh1 < OH)
#endif
}

__kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
@@ -456,37 +454,35 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t
out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1));
}

if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
} // end if (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
} // end if (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
} // end if (oh1 < OH)
} // end if (co_slice1 < CO_SLICES)
} else {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
} // end (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
} // end if (oh1 < OH)
} // end if (co_slice1 < CO_SLICES)
}
} // end if (co_slice1 < CO_SLICES)
#else
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
} // end (oh1 < OH)
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
} // end if (oh1 < OH)
#endif
}

__kernel void Conv2D_H2W2C2_Img(__read_only image2d_t input, __write_only image2d_t output,
@@ -644,35 +640,33 @@ __kernel void Conv2D_H2W2C2_Img(__read_only image2d_t input, __write_only image2
out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1));
}

if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
#ifndef EXCEDD_MAX_IMAGE2D_WIDTH
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
} // end if (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0);
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
} // end if (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1);
WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1);
} // end if (oh1 < OH)
} // end if (co_slice1 < CO_SLICES)
} else {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
} // end (oh1 < OH)
if (co_slice1 < CO_SLICES) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
} // end if (oh1 < OH)
} // end if (co_slice1 < CO_SLICES)
}
} // end if (co_slice1 < CO_SLICES)
#else
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0);
WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0);
} // end (oh1 < OH)
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1);
WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1);
if (oh1 < OH) {
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1);
WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1);
} // end if (oh1 < OH)
#endif
}

+ 149
- 51
mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl View File

@@ -35,29 +35,79 @@ __kernel void Winograd4x4To36(__read_only image2d_t input, // height=N*H
FLT4 BtD_row[6] = {0};
int h = tile_h * 4 - pad;
int w = tile_w * 4 - pad;
for (int y = 0; y < 6; y++) {
int x_idx = w * CI_SLICES + ci_slice;
for (int x = 0; x < 6; x++) {
// no need to check w: because ci_slice is in [0, CI_SLICES). when w<0, x_idx<0; w>=W, x_idx>=W*CI_SLICES
// if (w < 0 || w >= W) { continue; }
BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, h));
x_idx += CI_SLICES;
}
h++;
}

int y_idx = ci_slice * 36 + row * 6;
for (int y = 0; y < 6; y++) {
FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
for (int x = 0; x < 6; x++) {
acc += BtD_row[x] * Bt[y * 6 + x];
}
int x_idx = w * CI_SLICES + ci_slice;
FLT bt0 = Bt_row[0], bt1 = Bt_row[1], bt2 = Bt_row[2], bt3 = Bt_row[3], bt4 = Bt_row[4], bt5 = Bt_row[5];
BtD_row[0] =
bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
x_idx += CI_SLICES;
BtD_row[1] =
bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
x_idx += CI_SLICES;
BtD_row[2] =
bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
x_idx += CI_SLICES;
BtD_row[3] =
bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
x_idx += CI_SLICES;
BtD_row[4] =
bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));
x_idx += CI_SLICES;
BtD_row[5] =
bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) +
bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) +
bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5));

#if FP16_ENABLE
acc = min(acc, HALF_MAX);
acc = max(acc, -HALF_MAX);
#ifndef HALF_MAX // adreno not exist
#define HALF_MAX 0x1.ffcp15h
#endif
WRITE_IMAGE(output, (int2)(tile_hw, y_idx + y), acc);
}
#define LimitAcc() \
acc = min(acc, HALF_MAX); \
acc = max(acc, -HALF_MAX);
#else
#define LimitAcc() \
{}
#endif

int y_idx = ci_slice * 36 + row * 6;
FLT4 acc = BtD_row[0] + (FLT)(-2.5f) * BtD_row[2] + BtD_row[4];
LimitAcc();
WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);

FLT4 tmp0 = (FLT)(0.9428091049f) * BtD_row[1] + (FLT)(-0.4714044929f) * BtD_row[3];
FLT4 tmp1 = (FLT)(1.3333333731f) * BtD_row[2] + (FLT)(-0.6666667461f) * BtD_row[4];
acc = tmp0 + tmp1;
LimitAcc();
WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);

acc = -tmp0 + tmp1;
LimitAcc();
WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);

tmp0 = (FLT)(-0.1178511307f) * BtD_row[1] + (FLT)(0.2357022613f) * BtD_row[3];
tmp1 = (FLT)(-0.0833333358f) * BtD_row[2] + (FLT)(0.1666666865f) * BtD_row[4];
acc = tmp0 + tmp1;
LimitAcc();
WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);

acc = -tmp0 + tmp1;
LimitAcc();
WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);

acc = BtD_row[1] + (FLT)(-2.5f) * BtD_row[3] + BtD_row[5];
LimitAcc();
WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc);
}

__kernel void WinogradConv2D(__read_only image2d_t input, // height=CI_SLICES*36 width=TILE_HW
@@ -181,6 +231,22 @@ constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.000000000
0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,
0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f};

#define UpdateAcc() \
if (bias != 0) acc += bias[co_slice]; \
if (act_type == ActivationType_RELU) { \
acc = max(acc, (FLT4)(0.0f)); \
} else if (act_type == ActivationType_RELU6) { \
acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); \
} else if (act_type == ActivationType_TANH) { \
FLT4 exp0 = exp(acc); \
FLT4 exp1 = exp(-acc); \
acc = (exp0 - exp1) / (exp0 + exp1); \
} else if (act_type == ActivationType_LEAKY_RELU) { \
DO_LEAKY_RELU(acc, alpha); \
} else if (act_type == ActivationType_SIGMOID) { \
acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc)); \
}

__kernel void Winograd36To4x4(__read_only image2d_t input, // height=CO_SLICES*36 width=TILE_HW
__write_only image2d_t output, // height=N*H width=W*CO_SLICES
__global FLT4 *bias,
@@ -198,11 +264,49 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, // height=CO_SLICE

constant FLT *At_row = At + row * 6;
FLT4 AtM_row[6] = {0};
for (int y = 0, idx = co_slice * 36; y < 6; y++) {
for (int x = 0; x < 6; x++, idx++) {
AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx));
}
}
int idx = co_slice * 36;
FLT at = At_row[0];
AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
at = At_row[1];
AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
at = At_row[2];
AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
at = At_row[3];
AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
at = At_row[4];
AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
at = At_row[5];
AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));
AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++));

int TILE_W = UP_DIV(W, 4);
int tile_w = tile_hw % TILE_W;
@@ -210,30 +314,24 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, // height=CO_SLICE
int h = tile_h * 4 + row;
int w = tile_w * 4;
int x_idx = w * CO_SLICES + co_slice;
for (int x = 0, idx = 0; x < 4; x++) {
FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
for (int y = 0; y < 6; y++, idx++) {
acc += AtM_row[y] * At[idx];
}

if (bias != 0) {
acc += bias[co_slice];
}

if (act_type == ActivationType_RELU) {
acc = max(acc, (FLT4)(0.0f));
} else if (act_type == ActivationType_RELU6) {
acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0 = exp(acc);
FLT4 exp1 = exp(-acc);
acc = (exp0 - exp1) / (exp0 + exp1);
} else if (act_type == ActivationType_LEAKY_RELU) {
DO_LEAKY_RELU(acc, alpha);
} else if (act_type == ActivationType_SIGMOID) {
acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc));
}
WRITE_IMAGE(output, (int2)(x_idx, h), acc);
x_idx += CO_SLICES;
}

FLT4 acc = AtM_row[0] + AtM_row[1] + AtM_row[2] + AtM_row[3] + AtM_row[4];
UpdateAcc();
WRITE_IMAGE(output, (int2)(x_idx, h), acc);
x_idx += CO_SLICES;

acc = (FLT)(0.7071067691f) * (AtM_row[1] - AtM_row[2]) + (FLT)(1.4142135382f) * (AtM_row[3] - AtM_row[4]);
UpdateAcc();
WRITE_IMAGE(output, (int2)(x_idx, h), acc);
x_idx += CO_SLICES;

acc = (FLT)(0.5f) * (AtM_row[1] + AtM_row[2]) + (FLT)(2.0f) * (AtM_row[3] + AtM_row[4]);
UpdateAcc();
WRITE_IMAGE(output, (int2)(x_idx, h), acc);
x_idx += CO_SLICES;

acc =
(FLT)(0.3535533845f) * (AtM_row[1] - AtM_row[2]) + (FLT)(2.8284270763f) * (AtM_row[3] - AtM_row[4]) + AtM_row[5];
UpdateAcc();
WRITE_IMAGE(output, (int2)(x_idx, h), acc);
}

+ 3
- 8
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc View File

@@ -144,7 +144,9 @@ void Conv2DOpenCLKernel::BuildKernel() {
kernel_name << "_Img";
}
ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str());
std::string build_option =
(OW_ * CO_SLICES_ <= ocl_runtime_->GetMaxImage2DWidth()) ? "" : " -DEXCEDD_MAX_IMAGE2D_WIDTH";
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str(), {build_option});
}

void Conv2DOpenCLKernel::SetBlockSize() {
@@ -436,13 +438,6 @@ OpParameter *CreateFcParam(const ConvParameter *conv_param, const std::vector<li

bool UseWinograd4x4To6x6(const ConvParameter *param, const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs) {
// not use winograd on adreno gpu
lite::opencl::OpenCLRuntimeWrapper runtime_wrap;
lite::opencl::OpenCLRuntime *runtime = runtime_wrap.GetInstance();
if (runtime->GetGpuInfo().type == lite::opencl::GpuType::ADRENO) {
return false;
}

if (!(inputs.size() == 2 || inputs.size() == 3) || outputs.empty()) {
return false;
}


+ 7
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc View File

@@ -62,6 +62,11 @@ int PadOpenCLKernel::CheckSpecs() {
MS_LOG(ERROR) << "Pad only support CONSTANT MODE.";
return RET_ERROR;
}
auto pad_shape = in_tensors_.at(1)->shape();
if (pad_shape.size() != 2 || pad_shape[0] != in_ndim || pad_shape[1] != 2) {
MS_LOG(ERROR) << "pad tensor shape invalid.";
return RET_ERROR;
}
return RET_OK;
}

@@ -86,8 +91,9 @@ void PadOpenCLKernel::SetConstArgs() {
int ndim = in_tensors_.front()->shape().size();
std::vector<int> pad_before_ori;
pad_before_ori.reserve(ndim);
auto paddings = reinterpret_cast<int32_t *>(in_tensors_.at(1)->data_c());
for (size_t i = 0; i < ndim; i++) {
pad_before_ori.push_back(param_->paddings_[MAX_PAD_SIZE - 2 * ndim + 2 * i]);
pad_before_ori.push_back(paddings[2 * i]);
}
cl_int4 pad_before;
Broadcast2GpuShape(pad_before.s, pad_before_ori.data(), ndim, 0);


+ 23
- 9
mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc View File

@@ -15,7 +15,9 @@
*/

#include "src/runtime/kernel/opencl/kernel/winograd.h"
#include <memory>
#include "src/runtime/kernel/opencl/cl/winograd.cl.inc"
#include "nnacl/base/minimal_filtering_generator.h"

using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
@@ -30,13 +32,15 @@ void Align(const std::vector<int> &global, const std::vector<int> &local, cl::ND
cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2]));
}

constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};

#ifndef ENABLE_ARM64
constexpr float G[] = {1.0000000000, 0.0000000000, 0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702,
1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808,
1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000};
std::vector<float> GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, size_t CI) {
constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
constexpr float G[] = {1.0000000000, 0.0000000000, 0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702,
1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808,
1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000};
auto src_fp32 = reinterpret_cast<float *>(src);
auto src_fp16 = reinterpret_cast<float16_t *>(src);
std::function<float(int)> access_func;
@@ -71,6 +75,8 @@ std::vector<float> GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, si
}
return dst;
}
#endif

} // namespace

void WinogradOpenCLKernel::BuildKernel() {
@@ -106,9 +112,17 @@ void WinogradOpenCLKernel::InitFilter() {

// rearrange filter
auto filter_tensor = in_tensors_.at(1);
#ifndef ENABLE_ARM64
auto winograd_filter = GenerateWinogradFilter(filter_tensor->data_c(), filter_tensor->data_type(), CO_, CI_);

void *src_data = winograd_filter.data();
#else
std::unique_ptr<float[]> winograd_filter(new float[CO_ * 6 * 6 * CI_]);
WinogradWeightTransform(reinterpret_cast<const float *>(filter_tensor->data_c()),
reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false);

void *src_data = winograd_filter.get();
#endif

auto src_dtype = kNumberTypeFloat32;
auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32;
std::vector<char> tmp(size, 0);
@@ -173,8 +187,8 @@ void WinogradOpenCLKernel::SetConstArgs() {

void WinogradOpenCLKernel::SetGlobalLocal() {
Align({TILE_HW_, 6, CI_SLICES_}, {8, 6, 4}, &global_4x4to36_, &local_4x4to36_);
Align({UP_DIV(TILE_HW_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 6, 2}, &global_range_, &local_range_);
Align({TILE_HW_, 4, CO_SLICES_}, {32, 4, 2}, &global_36to4x4_, &local_36to4x4_);
Align({UP_DIV(TILE_HW_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 3, 8}, &global_range_, &local_range_);
Align({TILE_HW_, 4, CO_SLICES_}, {4, 4, 8}, &global_36to4x4_, &local_36to4x4_);
}

int WinogradOpenCLKernel::Run() {


+ 0
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.h View File

@@ -29,7 +29,6 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
: Conv2DOpenCLKernel(parameter, inputs, outputs, ctx) {
use_winograd_ = true;
filter_type_ = MemType::BUF;
}

~WinogradOpenCLKernel() override = default;


+ 5
- 4
mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc View File

@@ -243,10 +243,11 @@ void TryMergePadXxx(LiteKernel *node, std::set<LiteKernel *> *removed_set, std::

auto *conv_param = reinterpret_cast<ParamType *>(reinterpret_cast<OpenCLKernel *>(node)->GetParameter());
MS_ASSERT(conv_param);
conv_param->pad_u_ += pad_param->paddings_[2];
conv_param->pad_d_ += pad_param->paddings_[3];
conv_param->pad_l_ += pad_param->paddings_[4];
conv_param->pad_r_ += pad_param->paddings_[5];
auto paddings = reinterpret_cast<int32_t *>(pad->in_tensors().at(1)->data_c());
conv_param->pad_u_ += paddings[2];
conv_param->pad_d_ += paddings[3];
conv_param->pad_l_ += paddings[4];
conv_param->pad_r_ += paddings[5];
pad->set_in_tensors({pad->in_tensors().front()});
MergeRemoveA(pad, node, removed_set);
MS_LOG(DEBUG) << "Merge Pad and " + GetTypeName(node) + " success";


Loading…
Cancel
Save