| @@ -91,11 +91,11 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t | |||
| out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); | |||
| } | |||
| if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); | |||
| } else { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0); | |||
| } | |||
| #ifndef EXCEDD_MAX_IMAGE2D_WIDTH | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); | |||
| #else | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0); | |||
| #endif | |||
| } | |||
| __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, | |||
| @@ -172,17 +172,17 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t | |||
| out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); | |||
| } | |||
| if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); | |||
| } // end if (oh1 < OH) | |||
| } else { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0); | |||
| } // end (oh1 < OH) | |||
| } | |||
| #ifndef EXCEDD_MAX_IMAGE2D_WIDTH | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); | |||
| } // end if (oh1 < OH) | |||
| #else | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0); | |||
| } // end (oh1 < OH) | |||
| #endif | |||
| } | |||
| __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, | |||
| @@ -283,29 +283,27 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t | |||
| out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1)); | |||
| } | |||
| if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); | |||
| #ifndef EXCEDD_MAX_IMAGE2D_WIDTH | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); | |||
| } // end if (oh1 < OH) | |||
| if (co_slice1 < CO_SLICES) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); | |||
| } // end if (oh1 < OH) | |||
| if (co_slice1 < CO_SLICES) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); | |||
| } // end if (oh1 < OH) | |||
| } // end if (co_slice1 < CO_SLICES) | |||
| } else { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0); | |||
| } // end (oh1 < OH) | |||
| if (co_slice1 < CO_SLICES) { | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1); | |||
| } // end if (oh1 < OH) | |||
| } // end if (co_slice1 < CO_SLICES) | |||
| } | |||
| } // end if (co_slice1 < CO_SLICES) | |||
| #else | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0); | |||
| } // end (oh1 < OH) | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1); | |||
| } // end if (oh1 < OH) | |||
| #endif | |||
| } | |||
| __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, | |||
| @@ -456,37 +454,35 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t | |||
| out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1)); | |||
| } | |||
| if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0); | |||
| #ifndef EXCEDD_MAX_IMAGE2D_WIDTH | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0); | |||
| } // end if (oh1 < OH) | |||
| if (co_slice1 < CO_SLICES) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0); | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1); | |||
| } // end if (oh1 < OH) | |||
| if (co_slice1 < CO_SLICES) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1); | |||
| } // end if (oh1 < OH) | |||
| } // end if (co_slice1 < CO_SLICES) | |||
| } else { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0); | |||
| } // end (oh1 < OH) | |||
| if (co_slice1 < CO_SLICES) { | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1); | |||
| } // end if (oh1 < OH) | |||
| } // end if (co_slice1 < CO_SLICES) | |||
| } | |||
| } // end if (co_slice1 < CO_SLICES) | |||
| #else | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0); | |||
| } // end (oh1 < OH) | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1); | |||
| } // end if (oh1 < OH) | |||
| #endif | |||
| } | |||
| __kernel void Conv2D_H2W2C2_Img(__read_only image2d_t input, __write_only image2d_t output, | |||
| @@ -644,35 +640,33 @@ __kernel void Conv2D_H2W2C2_Img(__read_only image2d_t input, __write_only image2 | |||
| out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1)); | |||
| } | |||
| if (OW * CO_SLICES <= MAX_IMAGE2D_WIDTH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0); | |||
| #ifndef EXCEDD_MAX_IMAGE2D_WIDTH | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh0), out_h0_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh0), out_h0_w1_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0); | |||
| } // end if (oh1 < OH) | |||
| if (co_slice1 < CO_SLICES) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice0, n_oh1), out_h1_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice0, n_oh1), out_h1_w1_c0); | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1); | |||
| } // end if (oh1 < OH) | |||
| if (co_slice1 < CO_SLICES) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh0), out_h0_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh0), out_h0_w1_c1); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(ow0 * CO_SLICES + co_slice1, n_oh1), out_h1_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(ow1 * CO_SLICES + co_slice1, n_oh1), out_h1_w1_c1); | |||
| } // end if (oh1 < OH) | |||
| } // end if (co_slice1 < CO_SLICES) | |||
| } else { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0); | |||
| } // end (oh1 < OH) | |||
| if (co_slice1 < CO_SLICES) { | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1); | |||
| } // end if (oh1 < OH) | |||
| } // end if (co_slice1 < CO_SLICES) | |||
| } | |||
| } // end if (co_slice1 < CO_SLICES) | |||
| #else | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow0), out_h0_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh0 * OW + ow1), out_h0_w1_c0); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow0), out_h1_w0_c0); | |||
| WRITE_IMAGE(output, (int2)(co_slice0, n_oh1 * OW + ow1), out_h1_w1_c0); | |||
| } // end (oh1 < OH) | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow0), out_h0_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh0 * OW + ow1), out_h0_w1_c1); | |||
| if (oh1 < OH) { | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow0), out_h1_w0_c1); | |||
| WRITE_IMAGE(output, (int2)(co_slice1, n_oh1 * OW + ow1), out_h1_w1_c1); | |||
| } // end if (oh1 < OH) | |||
| #endif | |||
| } | |||
| @@ -35,29 +35,79 @@ __kernel void Winograd4x4To36(__read_only image2d_t input, // height=N*H | |||
| FLT4 BtD_row[6] = {0}; | |||
| int h = tile_h * 4 - pad; | |||
| int w = tile_w * 4 - pad; | |||
| for (int y = 0; y < 6; y++) { | |||
| int x_idx = w * CI_SLICES + ci_slice; | |||
| for (int x = 0; x < 6; x++) { | |||
| // no need to check w: because ci_slice is in [0, CI_SLICES). when w<0, x_idx<0; w>=W, x_idx>=W*CI_SLICES | |||
| // if (w < 0 || w >= W) { continue; } | |||
| BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_zero, (int2)(x_idx, h)); | |||
| x_idx += CI_SLICES; | |||
| } | |||
| h++; | |||
| } | |||
| int y_idx = ci_slice * 36 + row * 6; | |||
| for (int y = 0; y < 6; y++) { | |||
| FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | |||
| for (int x = 0; x < 6; x++) { | |||
| acc += BtD_row[x] * Bt[y * 6 + x]; | |||
| } | |||
| int x_idx = w * CI_SLICES + ci_slice; | |||
| FLT bt0 = Bt_row[0], bt1 = Bt_row[1], bt2 = Bt_row[2], bt3 = Bt_row[3], bt4 = Bt_row[4], bt5 = Bt_row[5]; | |||
| BtD_row[0] = | |||
| bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) + | |||
| bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) + | |||
| bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5)); | |||
| x_idx += CI_SLICES; | |||
| BtD_row[1] = | |||
| bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) + | |||
| bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) + | |||
| bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5)); | |||
| x_idx += CI_SLICES; | |||
| BtD_row[2] = | |||
| bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) + | |||
| bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) + | |||
| bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5)); | |||
| x_idx += CI_SLICES; | |||
| BtD_row[3] = | |||
| bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) + | |||
| bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) + | |||
| bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5)); | |||
| x_idx += CI_SLICES; | |||
| BtD_row[4] = | |||
| bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) + | |||
| bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) + | |||
| bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5)); | |||
| x_idx += CI_SLICES; | |||
| BtD_row[5] = | |||
| bt0 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 0)) + bt1 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 1)) + | |||
| bt2 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 2)) + bt3 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 3)) + | |||
| bt4 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 4)) + bt5 * READ_IMAGE(input, smp_zero, (int2)(x_idx, h + 5)); | |||
| #if FP16_ENABLE | |||
| acc = min(acc, HALF_MAX); | |||
| acc = max(acc, -HALF_MAX); | |||
| #ifndef HALF_MAX // adreno not exist | |||
| #define HALF_MAX 0x1.ffcp15h | |||
| #endif | |||
| WRITE_IMAGE(output, (int2)(tile_hw, y_idx + y), acc); | |||
| } | |||
| #define LimitAcc() \ | |||
| acc = min(acc, HALF_MAX); \ | |||
| acc = max(acc, -HALF_MAX); | |||
| #else | |||
| #define LimitAcc() \ | |||
| {} | |||
| #endif | |||
| int y_idx = ci_slice * 36 + row * 6; | |||
| FLT4 acc = BtD_row[0] + (FLT)(-2.5f) * BtD_row[2] + BtD_row[4]; | |||
| LimitAcc(); | |||
| WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc); | |||
| FLT4 tmp0 = (FLT)(0.9428091049f) * BtD_row[1] + (FLT)(-0.4714044929f) * BtD_row[3]; | |||
| FLT4 tmp1 = (FLT)(1.3333333731f) * BtD_row[2] + (FLT)(-0.6666667461f) * BtD_row[4]; | |||
| acc = tmp0 + tmp1; | |||
| LimitAcc(); | |||
| WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc); | |||
| acc = -tmp0 + tmp1; | |||
| LimitAcc(); | |||
| WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc); | |||
| tmp0 = (FLT)(-0.1178511307f) * BtD_row[1] + (FLT)(0.2357022613f) * BtD_row[3]; | |||
| tmp1 = (FLT)(-0.0833333358f) * BtD_row[2] + (FLT)(0.1666666865f) * BtD_row[4]; | |||
| acc = tmp0 + tmp1; | |||
| LimitAcc(); | |||
| WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc); | |||
| acc = -tmp0 + tmp1; | |||
| LimitAcc(); | |||
| WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc); | |||
| acc = BtD_row[1] + (FLT)(-2.5f) * BtD_row[3] + BtD_row[5]; | |||
| LimitAcc(); | |||
| WRITE_IMAGE(output, (int2)(tile_hw, y_idx++), acc); | |||
| } | |||
| __kernel void WinogradConv2D(__read_only image2d_t input, // height=CI_SLICES*36 width=TILE_HW | |||
| @@ -181,6 +231,22 @@ constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.000000000 | |||
| 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f, | |||
| 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; | |||
| #define UpdateAcc() \ | |||
| if (bias != 0) acc += bias[co_slice]; \ | |||
| if (act_type == ActivationType_RELU) { \ | |||
| acc = max(acc, (FLT4)(0.0f)); \ | |||
| } else if (act_type == ActivationType_RELU6) { \ | |||
| acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); \ | |||
| } else if (act_type == ActivationType_TANH) { \ | |||
| FLT4 exp0 = exp(acc); \ | |||
| FLT4 exp1 = exp(-acc); \ | |||
| acc = (exp0 - exp1) / (exp0 + exp1); \ | |||
| } else if (act_type == ActivationType_LEAKY_RELU) { \ | |||
| DO_LEAKY_RELU(acc, alpha); \ | |||
| } else if (act_type == ActivationType_SIGMOID) { \ | |||
| acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc)); \ | |||
| } | |||
| __kernel void Winograd36To4x4(__read_only image2d_t input, // height=CO_SLICES*36 width=TILE_HW | |||
| __write_only image2d_t output, // height=N*H width=W*CO_SLICES | |||
| __global FLT4 *bias, | |||
| @@ -198,11 +264,49 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, // height=CO_SLICE | |||
| constant FLT *At_row = At + row * 6; | |||
| FLT4 AtM_row[6] = {0}; | |||
| for (int y = 0, idx = co_slice * 36; y < 6; y++) { | |||
| for (int x = 0; x < 6; x++, idx++) { | |||
| AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx)); | |||
| } | |||
| } | |||
| int idx = co_slice * 36; | |||
| FLT at = At_row[0]; | |||
| AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| at = At_row[1]; | |||
| AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| at = At_row[2]; | |||
| AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| at = At_row[3]; | |||
| AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| at = At_row[4]; | |||
| AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| at = At_row[5]; | |||
| AtM_row[0] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[1] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[2] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[3] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[4] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| AtM_row[5] += at * READ_IMAGE(input, smp_zero, (int2)(tile_hw, idx++)); | |||
| int TILE_W = UP_DIV(W, 4); | |||
| int tile_w = tile_hw % TILE_W; | |||
| @@ -210,30 +314,24 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, // height=CO_SLICE | |||
| int h = tile_h * 4 + row; | |||
| int w = tile_w * 4; | |||
| int x_idx = w * CO_SLICES + co_slice; | |||
| for (int x = 0, idx = 0; x < 4; x++) { | |||
| FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | |||
| for (int y = 0; y < 6; y++, idx++) { | |||
| acc += AtM_row[y] * At[idx]; | |||
| } | |||
| if (bias != 0) { | |||
| acc += bias[co_slice]; | |||
| } | |||
| if (act_type == ActivationType_RELU) { | |||
| acc = max(acc, (FLT4)(0.0f)); | |||
| } else if (act_type == ActivationType_RELU6) { | |||
| acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); | |||
| } else if (act_type == ActivationType_TANH) { | |||
| FLT4 exp0 = exp(acc); | |||
| FLT4 exp1 = exp(-acc); | |||
| acc = (exp0 - exp1) / (exp0 + exp1); | |||
| } else if (act_type == ActivationType_LEAKY_RELU) { | |||
| DO_LEAKY_RELU(acc, alpha); | |||
| } else if (act_type == ActivationType_SIGMOID) { | |||
| acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc)); | |||
| } | |||
| WRITE_IMAGE(output, (int2)(x_idx, h), acc); | |||
| x_idx += CO_SLICES; | |||
| } | |||
| FLT4 acc = AtM_row[0] + AtM_row[1] + AtM_row[2] + AtM_row[3] + AtM_row[4]; | |||
| UpdateAcc(); | |||
| WRITE_IMAGE(output, (int2)(x_idx, h), acc); | |||
| x_idx += CO_SLICES; | |||
| acc = (FLT)(0.7071067691f) * (AtM_row[1] - AtM_row[2]) + (FLT)(1.4142135382f) * (AtM_row[3] - AtM_row[4]); | |||
| UpdateAcc(); | |||
| WRITE_IMAGE(output, (int2)(x_idx, h), acc); | |||
| x_idx += CO_SLICES; | |||
| acc = (FLT)(0.5f) * (AtM_row[1] + AtM_row[2]) + (FLT)(2.0f) * (AtM_row[3] + AtM_row[4]); | |||
| UpdateAcc(); | |||
| WRITE_IMAGE(output, (int2)(x_idx, h), acc); | |||
| x_idx += CO_SLICES; | |||
| acc = | |||
| (FLT)(0.3535533845f) * (AtM_row[1] - AtM_row[2]) + (FLT)(2.8284270763f) * (AtM_row[3] - AtM_row[4]) + AtM_row[5]; | |||
| UpdateAcc(); | |||
| WRITE_IMAGE(output, (int2)(x_idx, h), acc); | |||
| } | |||
| @@ -144,7 +144,9 @@ void Conv2DOpenCLKernel::BuildKernel() { | |||
| kernel_name << "_Img"; | |||
| } | |||
| ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str()); | |||
| std::string build_option = | |||
| (OW_ * CO_SLICES_ <= ocl_runtime_->GetMaxImage2DWidth()) ? "" : " -DEXCEDD_MAX_IMAGE2D_WIDTH"; | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str(), {build_option}); | |||
| } | |||
| void Conv2DOpenCLKernel::SetBlockSize() { | |||
| @@ -436,13 +438,6 @@ OpParameter *CreateFcParam(const ConvParameter *conv_param, const std::vector<li | |||
| bool UseWinograd4x4To6x6(const ConvParameter *param, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs) { | |||
| // not use winograd on adreno gpu | |||
| lite::opencl::OpenCLRuntimeWrapper runtime_wrap; | |||
| lite::opencl::OpenCLRuntime *runtime = runtime_wrap.GetInstance(); | |||
| if (runtime->GetGpuInfo().type == lite::opencl::GpuType::ADRENO) { | |||
| return false; | |||
| } | |||
| if (!(inputs.size() == 2 || inputs.size() == 3) || outputs.empty()) { | |||
| return false; | |||
| } | |||
| @@ -62,6 +62,11 @@ int PadOpenCLKernel::CheckSpecs() { | |||
| MS_LOG(ERROR) << "Pad only support CONSTANT MODE."; | |||
| return RET_ERROR; | |||
| } | |||
| auto pad_shape = in_tensors_.at(1)->shape(); | |||
| if (pad_shape.size() != 2 || pad_shape[0] != in_ndim || pad_shape[1] != 2) { | |||
| MS_LOG(ERROR) << "pad tensor shape invalid."; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -86,8 +91,9 @@ void PadOpenCLKernel::SetConstArgs() { | |||
| int ndim = in_tensors_.front()->shape().size(); | |||
| std::vector<int> pad_before_ori; | |||
| pad_before_ori.reserve(ndim); | |||
| auto paddings = reinterpret_cast<int32_t *>(in_tensors_.at(1)->data_c()); | |||
| for (size_t i = 0; i < ndim; i++) { | |||
| pad_before_ori.push_back(param_->paddings_[MAX_PAD_SIZE - 2 * ndim + 2 * i]); | |||
| pad_before_ori.push_back(paddings[2 * i]); | |||
| } | |||
| cl_int4 pad_before; | |||
| Broadcast2GpuShape(pad_before.s, pad_before_ori.data(), ndim, 0); | |||
| @@ -15,7 +15,9 @@ | |||
| */ | |||
| #include "src/runtime/kernel/opencl/kernel/winograd.h" | |||
| #include <memory> | |||
| #include "src/runtime/kernel/opencl/cl/winograd.cl.inc" | |||
| #include "nnacl/base/minimal_filtering_generator.h" | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| @@ -30,13 +32,15 @@ void Align(const std::vector<int> &global, const std::vector<int> &local, cl::ND | |||
| cl::NDRange(UP_ROUND(global[0], local[0]), UP_ROUND(global[1], local[1]), UP_ROUND(global[2], local[2])); | |||
| } | |||
| constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000, | |||
| 0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000, | |||
| 0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000}; | |||
| #ifndef ENABLE_ARM64 | |||
| constexpr float G[] = {1.0000000000, 0.0000000000, 0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702, | |||
| 1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808, | |||
| 1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000}; | |||
| std::vector<float> GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, size_t CI) { | |||
| constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000, | |||
| 0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000, | |||
| 0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000}; | |||
| constexpr float G[] = {1.0000000000, 0.0000000000, 0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702, | |||
| 1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808, | |||
| 1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000}; | |||
| auto src_fp32 = reinterpret_cast<float *>(src); | |||
| auto src_fp16 = reinterpret_cast<float16_t *>(src); | |||
| std::function<float(int)> access_func; | |||
| @@ -71,6 +75,8 @@ std::vector<float> GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, si | |||
| } | |||
| return dst; | |||
| } | |||
| #endif | |||
| } // namespace | |||
| void WinogradOpenCLKernel::BuildKernel() { | |||
| @@ -106,9 +112,17 @@ void WinogradOpenCLKernel::InitFilter() { | |||
| // rearrange filter | |||
| auto filter_tensor = in_tensors_.at(1); | |||
| #ifndef ENABLE_ARM64 | |||
| auto winograd_filter = GenerateWinogradFilter(filter_tensor->data_c(), filter_tensor->data_type(), CO_, CI_); | |||
| void *src_data = winograd_filter.data(); | |||
| #else | |||
| std::unique_ptr<float[]> winograd_filter(new float[CO_ * 6 * 6 * CI_]); | |||
| WinogradWeightTransform(reinterpret_cast<const float *>(filter_tensor->data_c()), | |||
| reinterpret_cast<float *>(winograd_filter.get()), nullptr, Gt, 1, 6, 3, CI_, CO_, false); | |||
| void *src_data = winograd_filter.get(); | |||
| #endif | |||
| auto src_dtype = kNumberTypeFloat32; | |||
| auto dst_dtype = use_fp16_ ? kNumberTypeFloat16 : kNumberTypeFloat32; | |||
| std::vector<char> tmp(size, 0); | |||
| @@ -173,8 +187,8 @@ void WinogradOpenCLKernel::SetConstArgs() { | |||
| void WinogradOpenCLKernel::SetGlobalLocal() { | |||
| Align({TILE_HW_, 6, CI_SLICES_}, {8, 6, 4}, &global_4x4to36_, &local_4x4to36_); | |||
| Align({UP_DIV(TILE_HW_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 6, 2}, &global_range_, &local_range_); | |||
| Align({TILE_HW_, 4, CO_SLICES_}, {32, 4, 2}, &global_36to4x4_, &local_36to4x4_); | |||
| Align({UP_DIV(TILE_HW_, 2), 36, UP_DIV(CO_SLICES_, 2)}, {8, 3, 8}, &global_range_, &local_range_); | |||
| Align({TILE_HW_, 4, CO_SLICES_}, {4, 4, 8}, &global_36to4x4_, &local_36to4x4_); | |||
| } | |||
| int WinogradOpenCLKernel::Run() { | |||
| @@ -29,7 +29,6 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel { | |||
| const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx) | |||
| : Conv2DOpenCLKernel(parameter, inputs, outputs, ctx) { | |||
| use_winograd_ = true; | |||
| filter_type_ = MemType::BUF; | |||
| } | |||
| ~WinogradOpenCLKernel() override = default; | |||
| @@ -243,10 +243,11 @@ void TryMergePadXxx(LiteKernel *node, std::set<LiteKernel *> *removed_set, std:: | |||
| auto *conv_param = reinterpret_cast<ParamType *>(reinterpret_cast<OpenCLKernel *>(node)->GetParameter()); | |||
| MS_ASSERT(conv_param); | |||
| conv_param->pad_u_ += pad_param->paddings_[2]; | |||
| conv_param->pad_d_ += pad_param->paddings_[3]; | |||
| conv_param->pad_l_ += pad_param->paddings_[4]; | |||
| conv_param->pad_r_ += pad_param->paddings_[5]; | |||
| auto paddings = reinterpret_cast<int32_t *>(pad->in_tensors().at(1)->data_c()); | |||
| conv_param->pad_u_ += paddings[2]; | |||
| conv_param->pad_d_ += paddings[3]; | |||
| conv_param->pad_l_ += paddings[4]; | |||
| conv_param->pad_r_ += paddings[5]; | |||
| pad->set_in_tensors({pad->in_tensors().front()}); | |||
| MergeRemoveA(pad, node, removed_set); | |||
| MS_LOG(DEBUG) << "Merge Pad and " + GetTypeName(node) + " success"; | |||