From: @chenzupeng Reviewed-by: @ddwsky,@zhanghaibo5 Signed-off-by: @zhanghaibo5tags/v1.2.0-rc1
| @@ -7,12 +7,13 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | |||
| const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(0); | |||
| int X = get_global_id(2); // N*H | |||
| int Y = get_global_id(1); // W | |||
| int Z = get_global_id(0); // C4 | |||
| int N = X / output_shape.y; | |||
| X = X % output_shape.y; | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| @@ -23,28 +24,30 @@ __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| bool outside_y = y_c < 0 || y_c >= input_shape.y; | |||
| bool outside_y = y_c < 0 || y_c >= input_shape.z; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; | |||
| r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f); | |||
| bool outside = outside_y || x_c < 0 || x_c >= input_shape.y; | |||
| r += | |||
| !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c)) : (FLT4)(0.0f); | |||
| window_size += !outside ? 1.0f : 0.0f; | |||
| } | |||
| } | |||
| FLT4 result = TO_FLT4(divide_no_check(r, window_size)); | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result); | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), result); | |||
| } | |||
| __kernel void AvgPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, | |||
| const int4 input_shape, const int4 output_shape, const int2 stride, | |||
| const int2 kernel_size, const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(0); | |||
| int X = get_global_id(2); // N*H | |||
| int Y = get_global_id(1); // W | |||
| int Z = get_global_id(0); // C4 | |||
| int N = X / output_shape.y; | |||
| X = X % output_shape.y; | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| @@ -55,28 +58,30 @@ __kernel void AvgPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| bool outside_y = y_c < 0 || y_c >= input_shape.y; | |||
| bool outside_y = y_c < 0 || y_c >= input_shape.z; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; | |||
| r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f); | |||
| bool outside = outside_y || x_c < 0 || x_c >= input_shape.y; | |||
| r += | |||
| !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c)) : (FLT4)(0.0f); | |||
| window_size += !outside ? 1.0f : 0.0f; | |||
| } | |||
| } | |||
| FLT4 result = TO_FLT4(divide_no_check(r, window_size)); | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(result, (FLT4)(0.f))); | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(result, (FLT4)(0.f))); | |||
| } | |||
| __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | |||
| const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(0); | |||
| int X = get_global_id(2); // N*H | |||
| int Y = get_global_id(1); // W | |||
| int Z = get_global_id(0); // C4 | |||
| int N = X / output_shape.y; | |||
| X = X % output_shape.y; | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| @@ -85,27 +90,28 @@ __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only i | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| if (y_c < 0 || y_c >= input_shape.y) continue; | |||
| if (y_c < 0 || y_c >= input_shape.z) continue; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| if (x_c < 0 || x_c >= input_shape.x) continue; | |||
| FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)); | |||
| if (x_c < 0 || x_c >= input_shape.y) continue; | |||
| FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c)); | |||
| maximum = max(src, maximum); | |||
| } | |||
| } | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum); | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), maximum); | |||
| } | |||
| __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, | |||
| const int4 input_shape, const int4 output_shape, const int2 stride, | |||
| const int2 kernel_size, const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(0); | |||
| int X = get_global_id(2); // N*H | |||
| int Y = get_global_id(1); // W | |||
| int Z = get_global_id(0); // C4 | |||
| int N = X / output_shape.y; | |||
| X = X % output_shape.y; | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| if (N >= output_shape.x || X >= output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| @@ -114,13 +120,13 @@ __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_o | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| if (y_c < 0 || y_c >= input_shape.y) continue; | |||
| if (y_c < 0 || y_c >= input_shape.z) continue; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| if (x_c < 0 || x_c >= input_shape.x) continue; | |||
| FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)); | |||
| if (x_c < 0 || x_c >= input_shape.y) continue; | |||
| FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, N * input_shape.y + x_c)); | |||
| maximum = max(src, maximum); | |||
| } | |||
| } | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f))); | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, N * output_shape.y + X), max(maximum, (FLT4)(0.f))); | |||
| } | |||
| @@ -4,16 +4,18 @@ | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void resize_nearest_neighbor_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, | |||
| int4 in_size, int4 out_size, float2 scale_factor) { | |||
| int X = get_global_id(2); // H | |||
| int X = get_global_id(2); // H * N | |||
| int Y = get_global_id(1); // W | |||
| int Z = get_global_id(0); // C4 | |||
| if (X >= out_size.y || Y >= out_size.z || Z >= out_size.w) { | |||
| if (X >= out_size.x * out_size.y || Y >= out_size.z || Z >= out_size.w) { | |||
| return; | |||
| } | |||
| int N = X / out_size.y; | |||
| X = X % out_size.y; | |||
| int src_x = (int)(X * scale_factor.x); | |||
| int src_y = (int)(Y * scale_factor.y); | |||
| WRITE_IMAGE(dst_data, (int2)(Y * out_size.w + Z, X), | |||
| READ_IMAGE(src_data, smp_zero, (int2)(src_y * in_size.w + Z, src_x))); | |||
| WRITE_IMAGE(dst_data, (int2)(Y * out_size.w + Z, N * out_size.y + X), | |||
| READ_IMAGE(src_data, smp_zero, (int2)(src_y * in_size.w + Z, N * in_size.y + src_x))); | |||
| } | |||
| __kernel void resize_nearest_neighbor_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, | |||
| @@ -32,25 +34,27 @@ __kernel void resize_nearest_neighbor_NC4HW4(__read_only image2d_t src_data, __w | |||
| __kernel void resize_bilinear_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 in_size, | |||
| int4 out_size, float2 scale_factor) { | |||
| int X = get_global_id(2); // H | |||
| int X = get_global_id(2); // H * N | |||
| int Y = get_global_id(1); // W | |||
| int Z = get_global_id(0); // C4 | |||
| if (X >= out_size.y || Y >= out_size.z || Z >= out_size.w) { | |||
| if (X >= out_size.x * out_size.y || Y >= out_size.z || Z >= out_size.w) { | |||
| return; | |||
| } | |||
| int N = X / out_size.y; | |||
| X = X % out_size.y; | |||
| float scale_x = X * scale_factor.x; | |||
| float scale_y = Y * scale_factor.y; | |||
| int src_x = (int)(scale_x); | |||
| int src_y = (int)(scale_y); | |||
| int src_x_1 = min(src_x + 1, in_size.y - 1); | |||
| int src_y_1 = min(src_y + 1, in_size.z - 1); | |||
| FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(src_y * in_size.w + Z, src_x)); | |||
| FLT4 src1 = READ_IMAGE(src_data, smp_zero, (int2)(src_y_1 * in_size.w + Z, src_x)); | |||
| FLT4 src2 = READ_IMAGE(src_data, smp_zero, (int2)(src_y * in_size.w + Z, src_x_1)); | |||
| FLT4 src3 = READ_IMAGE(src_data, smp_zero, (int2)(src_y_1 * in_size.w + Z, src_x_1)); | |||
| FLT4 src0 = READ_IMAGE(src_data, smp_zero, (int2)(src_y * in_size.w + Z, N * in_size.y + src_x)); | |||
| FLT4 src1 = READ_IMAGE(src_data, smp_zero, (int2)(src_y_1 * in_size.w + Z, N * in_size.y + src_x)); | |||
| FLT4 src2 = READ_IMAGE(src_data, smp_zero, (int2)(src_y * in_size.w + Z, N * in_size.y + src_x_1)); | |||
| FLT4 src3 = READ_IMAGE(src_data, smp_zero, (int2)(src_y_1 * in_size.w + Z, N * in_size.y + src_x_1)); | |||
| FLT4 result = | |||
| mix(mix(src0, src1, TO_FLT(scale_y - src_y)), mix(src2, src3, TO_FLT(scale_y - src_y)), TO_FLT(scale_x - src_x)); | |||
| WRITE_IMAGE(dst_data, (int2)(Y * out_size.w + Z, X), result); | |||
| WRITE_IMAGE(dst_data, (int2)(Y * out_size.w + Z, N * out_size.y + X), result); | |||
| } | |||
| __kernel void resize_bilinear_NC4HW4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 in_size, | |||
| @@ -8,20 +8,21 @@ __kernel void SoftmaxAxis3_NHWC4(__read_only image2d_t input, __write_only image | |||
| const int4 input_shape) { | |||
| int X = get_global_id(1); // H | |||
| int Y = get_global_id(0); // W | |||
| int n = get_global_id(2); // N | |||
| int H = input_shape.y; | |||
| int W = input_shape.z; | |||
| int C4 = input_shape.w; | |||
| if (X >= H || Y >= W) return; | |||
| if (n >= input_shape.x || X >= H || Y >= W) return; | |||
| // get max | |||
| float4 last = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X))); | |||
| float4 last = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, n * H + X))); | |||
| float input_max = last.x; | |||
| if (mask.y > 0.5f) input_max = max(input_max, last.y); | |||
| if (mask.z > 0.5f) input_max = max(input_max, last.z); | |||
| if (mask.w > 0.5f) input_max = max(input_max, last.w); | |||
| for (int d = 0; d < C4 - 1; ++d) { | |||
| float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, X))); | |||
| float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, n * H + X))); | |||
| input_max = max(input_max, t.x); | |||
| input_max = max(input_max, t.y); | |||
| input_max = max(input_max, t.z); | |||
| @@ -31,41 +32,42 @@ __kernel void SoftmaxAxis3_NHWC4(__read_only image2d_t input, __write_only image | |||
| float sum = 0.0f; | |||
| for (int d = 0; d < C4 - 1; ++d) { | |||
| float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, X))); | |||
| float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, n * H + X))); | |||
| sum += dot(exp(t - input_max_f4), (float4)(1.f)); | |||
| } | |||
| float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X))); | |||
| float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, n * H + X))); | |||
| sum += dot(exp(min(t - input_max_f4, 0)), mask); | |||
| for (int d = 0; d < C4 - 1; ++d) { | |||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, X))); | |||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + d, n * H + X))); | |||
| result = exp(result - input_max_f4) / sum; | |||
| WRITE_IMAGE(output, (int2)(Y * C4 + d, X), TO_FLT4(result)); | |||
| WRITE_IMAGEOUT(output, (int2)(Y * C4 + d, n * H + X), OUT_FLT4(result)); | |||
| } | |||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, X))); | |||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * C4 + C4 - 1, n * H + X))); | |||
| result = exp(min(result - input_max_f4, 0)) / sum; | |||
| result = result * mask; | |||
| WRITE_IMAGEOUT(output, (int2)(Y * C4 + C4 - 1, X), OUT_FLT4(result)); | |||
| WRITE_IMAGEOUT(output, (int2)(Y * C4 + C4 - 1, n * H + X), OUT_FLT4(result)); | |||
| } | |||
| __kernel void SoftmaxAxis1_NHWC4(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, | |||
| const int4 input_shape) { | |||
| int X = get_global_id(1); // W | |||
| int Y = get_global_id(0); // C4 | |||
| int n = get_global_id(2); // N | |||
| int H = input_shape.y; | |||
| int W = input_shape.z; | |||
| int C4 = input_shape.w; | |||
| if (X >= W || Y >= C4) return; | |||
| if (n >= input_shape.x || X >= W || Y >= C4) return; | |||
| float4 sum = 0.0f; | |||
| for (int d = 0; d < H; ++d) { | |||
| float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, d))); | |||
| float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, n * H + d))); | |||
| sum += exp(t); | |||
| } | |||
| for (int d = 0; d < H; ++d) { | |||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, d))); | |||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(X * C4 + Y, n * H + d))); | |||
| result = exp(result) / sum; | |||
| WRITE_IMAGEOUT(output, (int2)(X * C4 + Y, d), OUT_FLT4(result)); | |||
| WRITE_IMAGEOUT(output, (int2)(X * C4 + Y, n * H + d), OUT_FLT4(result)); | |||
| } | |||
| } | |||
| @@ -73,35 +75,38 @@ __kernel void SoftmaxAxis2_NHWC4(__read_only image2d_t input, __write_only image | |||
| const int4 input_shape) { | |||
| int X = get_global_id(1); // H | |||
| int Y = get_global_id(0); // C4 | |||
| int n = get_global_id(2); // n | |||
| int H = input_shape.y; | |||
| int W = input_shape.z; | |||
| int C4 = input_shape.w; | |||
| if (X >= H || Y >= C4) return; | |||
| if (n >= input_shape.x || X >= H || Y >= C4) return; | |||
| float4 sum = 0.0f; | |||
| for (int d = 0; d < W; ++d) { | |||
| float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, X))); | |||
| float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, n * H + X))); | |||
| sum += exp(t); | |||
| } | |||
| for (int d = 0; d < W; ++d) { | |||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, X))); | |||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(d * C4 + Y, n * H + X))); | |||
| result = exp(result) / sum; | |||
| WRITE_IMAGEOUT(output, (int2)(d * C4 + Y, X), OUT_FLT4(result)); | |||
| WRITE_IMAGEOUT(output, (int2)(d * C4 + Y, n * H + X), OUT_FLT4(result)); | |||
| } | |||
| } | |||
| __kernel void Softmax1x1_NHWC4(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, | |||
| const int4 input_shape) { | |||
| int tid = get_local_id(0); | |||
| int n = get_global_id(1); | |||
| if (n >= input_shape.x) return; | |||
| int C4 = input_shape.w; | |||
| float sum = 0.0f; | |||
| for (size_t i = tid; i < C4 - 1; i += 32) { | |||
| float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); | |||
| float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, n))); | |||
| sum += dot((float4)(1.0f), exp(src)); | |||
| } | |||
| if ((C4 - 1) % 32 == tid) { | |||
| float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(C4 - 1, 0))); | |||
| float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(C4 - 1, n))); | |||
| sum += dot(convert_float4(mask), exp(src)); | |||
| } | |||
| @@ -123,8 +128,8 @@ __kernel void Softmax1x1_NHWC4(__read_only image2d_t input, __write_only image2d | |||
| barrier(CLK_LOCAL_MEM_FENCE); | |||
| sum = tmpx1[0]; | |||
| for (size_t i = tid; i < C4; i += 32) { | |||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); | |||
| float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, n))); | |||
| result = exp(result) * sum; | |||
| WRITE_IMAGEOUT(output, (int2)(i, 0), OUT_FLT4(result)); | |||
| WRITE_IMAGEOUT(output, (int2)(i, n), OUT_FLT4(result)); | |||
| } | |||
| } | |||
| @@ -4,8 +4,11 @@ __kernel void space_to_batch_nd_NHWC4(__read_only image2d_t src_data, __write_on | |||
| int4 dst_size, int2 block_size, int4 paddings) { | |||
| int X = get_global_id(0); // c | |||
| int Y = get_global_id(1); // w | |||
| int Z = get_global_id(2); // h | |||
| if (X >= dst_size.x || Y >= dst_size.y || Y >= dst_size.z) { | |||
| int Z = get_global_id(2); // h * n_i | |||
| // (N,H*BH,W*BW,C) to (BH*BW*N,H,W,C) | |||
| int N_I = Z / dst_size.z; | |||
| Z = Z % dst_size.z; | |||
| if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z || N_I >= src_size.w) { | |||
| return; | |||
| } | |||
| for (int i = 0; i < block_size.x; ++i) { | |||
| @@ -13,8 +16,10 @@ __kernel void space_to_batch_nd_NHWC4(__read_only image2d_t src_data, __write_on | |||
| int w_org = Y * block_size.y + j - paddings.z; | |||
| int h_org = Z * block_size.x + i - paddings.x; | |||
| FLT4 res_data = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | |||
| res_data = READ_IMAGE(src_data, smp_zero, (int2)(w_org * dst_size.x + X, h_org)); | |||
| WRITE_IMAGE(dst_data, (int2)(Y * dst_size.x + X, (i * block_size.y + j) * dst_size.z + Z), res_data); | |||
| if (h_org >= 0 && h_org < src_size.z) | |||
| res_data = READ_IMAGE(src_data, smp_zero, (int2)(w_org * dst_size.x + X, N_I * src_size.z + h_org)); | |||
| WRITE_IMAGE(dst_data, (int2)(Y * dst_size.x + X, ((i * block_size.y + j) * src_size.w + N_I) * dst_size.z + Z), | |||
| res_data); | |||
| } | |||
| } | |||
| } | |||
| @@ -42,6 +42,10 @@ int PoolingOpenCLKernel::CheckSpecs() { | |||
| MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); | |||
| return RET_ERROR; | |||
| } | |||
| if (in_tensors_[0]->shape().size() != 4) { | |||
| MS_LOG(ERROR) << "Only support 4d tensor."; | |||
| return RET_ERROR; | |||
| } | |||
| if (parameter_->pool_mode_ != PoolMode_MaxPool && parameter_->pool_mode_ != PoolMode_AvgPool) { | |||
| MS_LOG(ERROR) << "Init `Pooling2d` kernel failed, unsupported pool mode!"; | |||
| return RET_ERROR; | |||
| @@ -88,7 +92,7 @@ int PoolingOpenCLKernel::Prepare() { | |||
| } | |||
| void PoolingOpenCLKernel::SetGlobalLocal() { | |||
| const size_t global_x = out_tensors_[0]->shape()[1]; | |||
| const size_t global_x = out_tensors_[0]->shape()[1] * out_tensors_[0]->shape()[0]; | |||
| const size_t global_y = out_tensors_[0]->shape()[2]; | |||
| const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); | |||
| global_size_ = {global_z, global_y, global_x}; | |||
| @@ -98,8 +102,8 @@ void PoolingOpenCLKernel::SetGlobalLocal() { | |||
| void PoolingOpenCLKernel::SetConstArgs() { | |||
| int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); | |||
| cl_int4 input_shape = {in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], in_tensors_[0]->shape()[3], slices}; | |||
| cl_int4 output_shape = {out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2], out_tensors_[0]->shape()[3], | |||
| cl_int4 input_shape = {in_tensors_[0]->shape()[0], in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], slices}; | |||
| cl_int4 output_shape = {out_tensors_[0]->shape()[0], out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2], | |||
| slices}; | |||
| cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_}; | |||
| cl_int2 kernel_size = {parameter_->window_h_, parameter_->window_w_}; | |||
| @@ -106,7 +106,7 @@ void ResizeOpenCLKernel::SetConstArgs() { | |||
| void ResizeOpenCLKernel::SetGlobalLocal() { | |||
| local_size_ = {}; | |||
| auto out_shape = GpuTensorInfo(out_tensors_[0]); | |||
| global_size_ = {out_shape.Slice, out_shape.W, out_shape.H}; | |||
| global_size_ = {out_shape.Slice, out_shape.W, out_shape.H * out_shape.N}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| @@ -54,10 +54,6 @@ int SoftmaxOpenCLKernel::CheckSpecs() { | |||
| MS_LOG(ERROR) << "Init Softmax kernel failed: Unsupported shape size: " << in_shape.size(); | |||
| return RET_ERROR; | |||
| } | |||
| if (in_shape[0] > 1) { | |||
| MS_LOG(ERROR) << "Init Softmax kernel failed: Unsupported multi-batch."; | |||
| return RET_ERROR; | |||
| } | |||
| if (axis_ < 0) { | |||
| axis_ = in_shape.size() + axis_; | |||
| } | |||
| @@ -104,8 +100,8 @@ int SoftmaxOpenCLKernel::Prepare() { | |||
| void SoftmaxOpenCLKernel::SetGlobalLocal() { | |||
| if (onexone_flag_) { | |||
| local_size_ = {32}; | |||
| global_size_ = {32}; | |||
| local_size_ = {32, 1}; | |||
| global_size_ = {32, out_shape_.N}; | |||
| } else { | |||
| size_t global_x, global_y; | |||
| if (axis_ == 1) { | |||
| @@ -121,7 +117,7 @@ void SoftmaxOpenCLKernel::SetGlobalLocal() { | |||
| global_x = 1; | |||
| global_y = 1; | |||
| } | |||
| global_size_ = {global_x, global_y}; | |||
| global_size_ = {global_x, global_y, out_shape_.N}; | |||
| local_size_ = {}; | |||
| } | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| @@ -85,7 +85,8 @@ void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() { | |||
| size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); | |||
| cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()}; | |||
| local_size_ = {1, 1, 1}; | |||
| global_size_ = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; | |||
| global_size_ = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], | |||
| (size_t)dst_size.s[2] * (size_t)(in_tensors_[0]->Batch())}; | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| @@ -239,10 +239,11 @@ int OpenCLSubGraph::Init() { | |||
| MS_ASSERT(tensor); | |||
| tensor->set_allocator(allocator_); | |||
| } | |||
| std::map<std::string, std::function<int(void)>> pass_manager{ | |||
| std::vector<std::pair<std::string, std::function<int(void)>>> pass_manager{ | |||
| {"FusionPass", std::bind(&OpenCLSubGraph::FusionPass, this)}, | |||
| {"InsertOpsPass", std::bind(&OpenCLSubGraph::InsertOpsPass, this)}, | |||
| {"UpdateTensorDataTypePass", std::bind(&OpenCLSubGraph::UpdateTensorDataTypePass, this)}, | |||
| {"FusionPass", std::bind(&OpenCLSubGraph::FusionPass, this)}}; | |||
| }; | |||
| for (auto iv : pass_manager) { | |||
| auto ret = iv.second(); | |||
| if (ret != RET_OK) { | |||
| @@ -23,7 +23,7 @@ class TestOpenCL_Pooling : public CommonTest {}; | |||
| namespace { | |||
| // PrimitiveType_Pooling: src/ops/populate/pooling_populate.cc | |||
| OpParameter *CreateParameter(PoolMode pool_mode, int window_h, int window_w, int stride_h, int stride_w, int pad_u, | |||
| int pad_d, int pad_l, int pad_r, RoundMode round_mode = RoundMode_No, | |||
| int pad_d, int pad_l, int pad_r, RoundMode round_mode = RoundMode_Floor, | |||
| ActType act_type = ActType_No) { | |||
| auto *param = test::CreateParameter<PoolingParameter>(schema::PrimitiveType_MaxPoolFusion); | |||
| param->global_ = false; | |||
| @@ -65,4 +65,27 @@ TEST_F(TestOpenCL_Pooling, Max) { | |||
| } | |||
| } | |||
| TEST_F(TestOpenCL_Pooling, AvgMultiBatch) { | |||
| std::vector<int> input_shape = {2, 2, 2, 4}; | |||
| std::vector<int> output_shape = {2, 1, 1, 4}; | |||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, | |||
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; | |||
| float output_data[] = {6, 7, 8, 9, 6, 7, 8, 9}; | |||
| for (auto fp16_enable : {false, true}) { | |||
| auto *param = CreateParameter(PoolMode_AvgPool, 2, 2, 2, 2, 0, 0, 0, 0); | |||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); | |||
| } | |||
| } | |||
| TEST_F(TestOpenCL_Pooling, MaxMultiBatch) { | |||
| std::vector<int> input_shape = {2, 2, 2, 4}; | |||
| std::vector<int> output_shape = {2, 1, 1, 4}; | |||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, | |||
| 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; | |||
| float output_data[] = {12, 13, 14, 15, 12, 13, 14, 15}; | |||
| for (auto fp16_enable : {false, true}) { | |||
| auto *param = CreateParameter(PoolMode_MaxPool, 2, 2, 2, 2, 0, 0, 0, 0); | |||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); | |||
| } | |||
| } | |||
| } // namespace mindspore::lite::opencl::test | |||
| @@ -83,4 +83,53 @@ TEST_F(TestOpenCL_Resize, NEAREST) { | |||
| } | |||
| } | |||
| TEST_F(TestOpenCL_Resize, BilinearBatch) { | |||
| schema::ResizeMethod method = schema::ResizeMethod_LINEAR; | |||
| int oh = 4; | |||
| int ow = 4; | |||
| bool align_corners = false; | |||
| std::vector<int> input_shape = {2, 2, 2, 1}; | |||
| std::vector<int> output_shape = {2, oh, ow, 1}; | |||
| float input_data[] = {0, 1, 2, 3, 0, 1, 2, 3}; | |||
| float output_data[] = {0, 0.5, 1, 1, 1, 1.5, 2, 2, 2, 2.5, 3, 3, 2, 2.5, 3, 3, | |||
| 0, 0.5, 1, 1, 1, 1.5, 2, 2, 2, 2.5, 3, 3, 2, 2.5, 3, 3}; | |||
| for (auto fp16_enable : {false, true}) { | |||
| auto *param = CreateParameter(method, oh, ow, align_corners); | |||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); | |||
| } | |||
| } | |||
| TEST_F(TestOpenCL_Resize, Bilinear_AlignCornersBatch) { | |||
| schema::ResizeMethod method = schema::ResizeMethod_LINEAR; | |||
| int oh = 3; | |||
| int ow = 3; | |||
| bool align_corners = true; | |||
| std::vector<int> input_shape = {2, 2, 2, 1}; | |||
| std::vector<int> output_shape = {2, oh, ow, 1}; | |||
| float input_data[] = {0, 1, 2, 3, 0, 1, 2, 3}; | |||
| float output_data[] = {0, 0.5, 1, 1, 1.5, 2, 2, 2.5, 3, 0, 0.5, 1, 1, 1.5, 2, 2, 2.5, 3}; | |||
| for (auto fp16_enable : {false, true}) { | |||
| auto *param = CreateParameter(method, oh, ow, align_corners); | |||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); | |||
| } | |||
| } | |||
| TEST_F(TestOpenCL_Resize, NEARESTBatch) { | |||
| schema::ResizeMethod method = schema::ResizeMethod_NEAREST; | |||
| int oh = 4; | |||
| int ow = 4; | |||
| bool align_corners = false; | |||
| std::vector<int> input_shape = {2, 2, 2, 1}; | |||
| std::vector<int> output_shape = {2, oh, ow, 1}; | |||
| float input_data[] = {0, 1, 2, 3, 0, 1, 2, 3}; | |||
| float output_data[] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3, | |||
| 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3}; | |||
| for (auto fp16_enable : {false, true}) { | |||
| auto *param = CreateParameter(method, oh, ow, align_corners); | |||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); | |||
| } | |||
| } | |||
| } // namespace mindspore::lite::opencl::test | |||
| @@ -71,4 +71,47 @@ TEST_F(TestOpenCL_SoftMax, 4D_axis1) { | |||
| } | |||
| } | |||
| TEST_F(TestOpenCL_SoftMax, 2D_axis1_N) { | |||
| int axis = 1; | |||
| std::vector<int> input_shape = {2, 10}; | |||
| std::vector<int> output_shape = input_shape; | |||
| float input_data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; | |||
| float output_data[] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, | |||
| 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1}; | |||
| for (auto fp16_enable : {false, true}) { | |||
| auto *param = CreateParameter(axis); | |||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable, | |||
| fp16_enable ? 2e-2 : 1e-5); | |||
| } | |||
| } | |||
| TEST_F(TestOpenCL_SoftMax, 4D_axis3_N) { | |||
| int axis = 3; | |||
| std::vector<int> input_shape = {2, 2, 1, 5}; | |||
| std::vector<int> output_shape = input_shape; | |||
| float input_data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; | |||
| float output_data[] = {0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, | |||
| 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2}; | |||
| for (auto fp16_enable : {false, true}) { | |||
| auto *param = CreateParameter(axis); | |||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable, | |||
| fp16_enable ? 2e-2 : 1e-5); | |||
| } | |||
| } | |||
| TEST_F(TestOpenCL_SoftMax, 4D_axis1_N) { | |||
| int axis = 1; | |||
| std::vector<int> input_shape = {2, 2, 1, 1}; | |||
| std::vector<int> output_shape = input_shape; | |||
| float input_data[] = {1, 1, 1, 1}; | |||
| float output_data[] = {0.5, 0.5, 0.5, 0.5}; | |||
| for (auto fp16_enable : {false, true}) { | |||
| auto *param = CreateParameter(axis); | |||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable, | |||
| fp16_enable ? 2e-2 : 1e-5); | |||
| } | |||
| } | |||
| } // namespace mindspore::lite::opencl::test | |||
| @@ -32,6 +32,7 @@ OpParameter *CreateParameter(const std::vector<int> &block_sizes, const std::vec | |||
| for (int i = 0; i < paddings.size(); ++i) { | |||
| param->paddings_[i] = paddings[i]; | |||
| } | |||
| param->m_ = 2; | |||
| return reinterpret_cast<OpParameter *>(param); | |||
| } | |||
| @@ -83,4 +84,27 @@ TEST_F(TestOpenCL_SpaceToBatch, H2W2Pad2222) { | |||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); | |||
| } | |||
| } | |||
| TEST_F(TestOpenCL_SpaceToBatch, H2W2Pad2222MultiBatch) { | |||
| std::vector<int> input_shape{2, 6, 6, 1}; | |||
| std::vector<int> block_sizes = {2, 2}; | |||
| std::vector<int> paddings = {2, 2, 2, 2}; | |||
| auto output_shape = InferShape(input_shape, block_sizes, paddings); | |||
| float input_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, | |||
| 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, | |||
| 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71}; | |||
| float output_data[] = {0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 12, 14, 16, 0, 0, 24, 26, 28, 0, 0, 0, 0, 0, 0, | |||
| 0, 0, 0, 0, 0, 0, 36, 38, 40, 0, 0, 48, 50, 52, 0, 0, 60, 62, 64, 0, 0, 0, 0, 0, 0, | |||
| 0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 13, 15, 17, 0, 0, 25, 27, 29, 0, 0, 0, 0, 0, 0, | |||
| 0, 0, 0, 0, 0, 0, 37, 39, 41, 0, 0, 49, 51, 53, 0, 0, 61, 63, 65, 0, 0, 0, 0, 0, 0, | |||
| 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 18, 20, 22, 0, 0, 30, 32, 34, 0, 0, 0, 0, 0, 0, | |||
| 0, 0, 0, 0, 0, 0, 42, 44, 46, 0, 0, 54, 56, 58, 0, 0, 66, 68, 70, 0, 0, 0, 0, 0, 0, | |||
| 0, 0, 0, 0, 0, 0, 7, 9, 11, 0, 0, 19, 21, 23, 0, 0, 31, 33, 35, 0, 0, 0, 0, 0, 0, | |||
| 0, 0, 0, 0, 0, 0, 43, 45, 47, 0, 0, 55, 57, 59, 0, 0, 67, 69, 71, 0, 0, 0, 0, 0, 0}; | |||
| for (auto fp16_enable : {false, true}) { | |||
| auto *param = CreateParameter(block_sizes, paddings); | |||
| TestMain({{input_shape, input_data, VAR}}, {output_shape, output_data}, param, fp16_enable); | |||
| } | |||
| } | |||
| } // namespace mindspore::lite::opencl::test | |||