From: @chenzupeng Reviewed-by: Signed-off-by: @ddwsky tags/v1.1.0
| @@ -1,36 +0,0 @@ | |||
| #ifdef cl_khr_fp16 | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| #endif | |||
| #define divide_no_check(a, b) (a / b) | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | |||
| const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(0); | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| FLT4 r = (FLT4)(0.0f); | |||
| FLT window_size = 0.0f; | |||
| int xs = X * stride.x - padding.x; | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| bool outside_y = y_c < 0 || y_c >= input_shape.y; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; | |||
| r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f); | |||
| window_size += !outside ? 1.0f : 0.0f; | |||
| } | |||
| } | |||
| FLT4 result = TO_FLT4(divide_no_check(r, window_size)); | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result); | |||
| } | |||
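A detail of the AvgPooling kernels in this diff worth noting: out-of-bounds taps add zero and are not counted in window_size, so the divide at the end averages only the valid elements (padding is excluded). A tiny scalar illustration of that rule, not taken from the patch:

#include <cstdio>

// Scalar illustration of the averaging rule above: a 2x2 window overlapping the
// border sums only its in-bounds taps and divides by how many there were.
int main() {
  float in[2] = {4.f, 8.f};       // the two in-bounds taps of a corner window
  float sum = in[0] + in[1];      // out-of-bounds taps contribute 0 and are skipped
  float window_size = 2.f;        // only valid taps are counted
  std::printf("%f\n", sum / window_size);  // prints 6.0, not (4+8)/4 = 3.0
  return 0;
}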
| @@ -1,119 +1,7 @@ | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void conv2d_transpose2x2_NHWC4(__read_only image2d_t src_data, __global FLT16 *weight, | |||
| __read_only image2d_t biases, __write_only image2d_t dst_data, int2 kernel_size, | |||
| int2 stride, int2 padding, int4 src_size, int4 dst_size) { | |||
| int h = get_global_id(0); | |||
| int kh = h % 2; | |||
| int src_h = h / 2; | |||
| src_h = src_h * 2; | |||
| int w = get_global_id(1); | |||
| int kw = w % 2; | |||
| int src_w = w / 2; | |||
| src_w = src_w * 2; | |||
| int co = get_global_id(2); | |||
| if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return; | |||
| FLT4 r0 = (FLT4)(0.f); | |||
| FLT4 r1 = (FLT4)(0.f); | |||
| FLT4 r2 = (FLT4)(0.f); | |||
| FLT4 r3 = (FLT4)(0.f); | |||
| int base_w = (co * 4 + kh * 2 + kw) * src_size.z; | |||
| for (int ci = 0; ci < src_size.z; ++ci) { | |||
| FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h)); | |||
| FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1)); | |||
| FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h)); | |||
| FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1)); | |||
| FLT16 weight_cache = weight[base_w++]; | |||
| r0 += x0.x * weight_cache.s0123; | |||
| r0 += x0.y * weight_cache.s4567; | |||
| r0 += x0.z * weight_cache.s89ab; | |||
| r0 += x0.w * weight_cache.scdef; | |||
| r1 += x1.x * weight_cache.s0123; | |||
| r1 += x1.y * weight_cache.s4567; | |||
| r1 += x1.z * weight_cache.s89ab; | |||
| r1 += x1.w * weight_cache.scdef; | |||
| r2 += x2.x * weight_cache.s0123; | |||
| r2 += x2.y * weight_cache.s4567; | |||
| r2 += x2.z * weight_cache.s89ab; | |||
| r2 += x2.w * weight_cache.scdef; | |||
| r3 += x3.x * weight_cache.s0123; | |||
| r3 += x3.y * weight_cache.s4567; | |||
| r3 += x3.z * weight_cache.s89ab; | |||
| r3 += x3.w * weight_cache.scdef; | |||
| } | |||
| FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0)); | |||
| r0 += bias_val; | |||
| r1 += bias_val; | |||
| r2 += bias_val; | |||
| r3 += bias_val; | |||
| WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0); | |||
| WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1); | |||
| WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2); | |||
| WRITE_IMAGE(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3); | |||
| } | |||
| __kernel void conv2d_transpose2x2_NC4HW4(__read_only image2d_t src_data, __global FLT16 *weight, | |||
| __read_only image2d_t biases, __write_only image2d_t dst_data, | |||
| int2 kernel_size, int2 stride, int2 padding, int4 src_size, int4 dst_size) { | |||
| int h = get_global_id(0); | |||
| int kh = h % 2; | |||
| int src_h = h / 2; | |||
| src_h = src_h * 2; | |||
| int w = get_global_id(1); | |||
| int kw = w % 2; | |||
| int src_w = w / 2; | |||
| src_w = src_w * 2; | |||
| int co = get_global_id(2); | |||
| if (src_h * 2 >= dst_size.x || src_w * 2 >= dst_size.y || co >= dst_size.z) return; | |||
| FLT4 r0 = (FLT4)(0.f); | |||
| FLT4 r1 = (FLT4)(0.f); | |||
| FLT4 r2 = (FLT4)(0.f); | |||
| FLT4 r3 = (FLT4)(0.f); | |||
| int base_w = (co * 4 + kh * 2 + kw) * src_size.z; | |||
| for (int ci = 0; ci < src_size.z; ++ci) { | |||
| FLT4 x0 = READ_IMAGE(src_data, smp_zero, (int2)(src_w, ci * src_size.x + src_h)); | |||
| FLT4 x1 = READ_IMAGE(src_data, smp_zero, (int2)(src_w, ci * src_size.x + src_h + 1)); | |||
| FLT4 x2 = READ_IMAGE(src_data, smp_zero, (int2)(src_w + 1, ci * src_size.x + src_h)); | |||
| FLT4 x3 = READ_IMAGE(src_data, smp_zero, (int2)(src_w + 1, ci * src_size.x + src_h + 1)); | |||
| FLT16 weight_cache = weight[base_w++]; | |||
| r0 += x0.x * weight_cache.s0123; | |||
| r0 += x0.y * weight_cache.s4567; | |||
| r0 += x0.z * weight_cache.s89ab; | |||
| r0 += x0.w * weight_cache.scdef; | |||
| r1 += x1.x * weight_cache.s0123; | |||
| r1 += x1.y * weight_cache.s4567; | |||
| r1 += x1.z * weight_cache.s89ab; | |||
| r1 += x1.w * weight_cache.scdef; | |||
| r2 += x2.x * weight_cache.s0123; | |||
| r2 += x2.y * weight_cache.s4567; | |||
| r2 += x2.z * weight_cache.s89ab; | |||
| r2 += x2.w * weight_cache.scdef; | |||
| r3 += x3.x * weight_cache.s0123; | |||
| r3 += x3.y * weight_cache.s4567; | |||
| r3 += x3.z * weight_cache.s89ab; | |||
| r3 += x3.w * weight_cache.scdef; | |||
| } | |||
| FLT4 bias_val = READ_IMAGE(biases, smp_zero, (int2)(co, 0)); | |||
| r0 += bias_val; | |||
| r1 += bias_val; | |||
| r2 += bias_val; | |||
| r3 += bias_val; | |||
| WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw, co * dst_size.x + 2 * src_h + kh), r0); | |||
| WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw, co * dst_size.x + 2 * src_h + kh + 2), r1); | |||
| WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw + 2, co * dst_size.x + 2 * src_h + kh), r2); | |||
| WRITE_IMAGE(dst_data, (int2)(2 * src_w + kw + 2, co * dst_size.x + 2 * src_h + kh + 2), r3); | |||
| } | |||
| __kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __global FLT16 *weight, | |||
| __read_only image2d_t biases, __write_only image2d_t dst_data, int2 kernel_size, | |||
| __kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, | |||
| __global FLT16 *weight, __read_only image2d_t biases, int2 kernel_size, | |||
| int2 stride, int2 padding, int4 src_size, int4 dst_size) { | |||
| int dst_h = get_global_id(0); | |||
| int rem_h = dst_h % stride.x; | |||
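The two conv2d_transpose2x2 kernels deleted above address their weights through base_w = (co * 4 + kh * 2 + kw) * src_size.z, which implies a flat buffer of FLT16 blocks ordered as [CO4][kh][kw][CI4], each block a 4-input-channel by 4-output-channel tile (rows s0123..scdef are multiplied by x.x..x.w). A small index helper restating that layout, as illustration only (the helper itself is not part of the patch):

#include <cstddef>

// Block index into the FLT16 weight buffer of the deleted conv2d_transpose2x2
// kernels, following "base_w = (co*4 + kh*2 + kw) * src_size.z" with a 2x2 kernel.
// ci4_count plays the role of src_size.z (input-channel slices).
size_t TransposeWeightBlock(int co4, int kh, int kw, int ci4, int ci4_count) {
  return (static_cast<size_t>(co4) * 4 + kh * 2 + kw) * ci4_count + ci4;
}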
| @@ -2,8 +2,8 @@ | |||
| #define C4NUM 4 | |||
| #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void FullConnection_NHWC4(__read_only image2d_t input, __global FLT16 *weight, __read_only image2d_t bias, | |||
| __write_only image2d_t output, int4 in_shape, int2 out_shape, float act_min, | |||
| __kernel void FullConnection_NHWC4(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, | |||
| __read_only image2d_t bias, int4 in_shape, int2 out_shape, float act_min, | |||
| float act_max) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidz = get_global_id(2); // N | |||
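These signature changes (here and in the MatMul hunk that follows) move the input and output images to parameters 0 and 1, ahead of the weight and bias. The host-side effect is visible later in this diff: SetConstArgs() binds everything from index 2 once, and Run() only rebinds the two tensor pointers each invocation. A sketch of that split using the plain OpenCL host API rather than MindSpore's wrapper, assumptions only:

#include <CL/cl.h>

// With input/output fixed at indices 0 and 1, constant arguments can be bound
// once up front and reused across runs.
void BindConstArgs(cl_kernel kernel, cl_mem weight, cl_mem bias) {
  cl_uint arg = 2;  // indices 0 and 1 stay free for the per-run input/output images
  clSetKernelArg(kernel, arg++, sizeof(cl_mem), &weight);
  clSetKernelArg(kernel, arg++, sizeof(cl_mem), &bias);
  // ... shape and activation arguments would follow here ...
}

void BindIoAndRun(cl_command_queue q, cl_kernel kernel, cl_mem in, cl_mem out,
                  const size_t global[2], const size_t local[2]) {
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &in);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &out);
  clEnqueueNDRangeKernel(q, kernel, 2, nullptr, global, local, 0, nullptr, nullptr);
}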
| @@ -2,7 +2,7 @@ | |||
| #define C4NUM 4 | |||
| #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output, | |||
| __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, | |||
| int4 in_shape, int4 out_shape) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidz = get_global_id(2); // N | |||
| @@ -32,37 +32,7 @@ __kernel void MatMul_NHWC4_2d(__read_only image2d_t input, __global FLT16 *weigh | |||
| } | |||
| } | |||
| __kernel void MatMul_NC4HW4_2d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output, | |||
| int4 in_shape, int4 out_shape) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidz = get_global_id(2); // N | |||
| int lidx = get_local_id(0); | |||
| int lidy = get_local_id(1); | |||
| int ci4 = UP_DIV(in_shape.w, C4NUM); | |||
| int co4 = UP_DIV(out_shape.w, C4NUM); | |||
| int n = out_shape.z; | |||
| bool inside = gidx < co4 && gidz < n; | |||
| FLT4 result = (FLT4)(0.0f); | |||
| for (uint i = lidy; i < ci4 && inside; i += 4) { | |||
| FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz * ci4 + i, 0)); | |||
| FLT16 w = weight[i * co4 + gidx]; | |||
| result.x += dot(v, w.s0123); | |||
| result.y += dot(v, w.s4567); | |||
| result.z += dot(v, w.s89ab); | |||
| result.w += dot(v, w.scdef); | |||
| } | |||
| __local FLT4 temp[32][4]; | |||
| temp[lidx][lidy] = result; | |||
| barrier(CLK_LOCAL_MEM_FENCE); | |||
| if (lidy == 0 && inside) { | |||
| result += temp[lidx][1]; | |||
| result += temp[lidx][2]; | |||
| result += temp[lidx][3]; | |||
| WRITE_IMAGE(output, (int2)(0, gidz * co4 + gidx), result); | |||
| } | |||
| } | |||
| __kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output, | |||
| __kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, | |||
| int4 in_shape, int4 out_shape) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidy = get_global_id(1); // N * H * 4 | |||
| @@ -95,39 +65,3 @@ __kernel void MatMul_NHWC4_4d(__read_only image2d_t input, __global FLT16 *weigh | |||
| WRITE_IMAGE(output, (int2)(gidz * co4 + gidx, nh_index), result); | |||
| } | |||
| } | |||
| __kernel void MatMul_NC4HW4_4d(__read_only image2d_t input, __global FLT16 *weight, __write_only image2d_t output, | |||
| int4 in_shape, int4 out_shape) { | |||
| int gidx = get_global_id(0); // CO4 | |||
| int gidy = get_global_id(1); // N * H * 4 | |||
| int gidz = get_global_id(2); // W | |||
| int lidx = get_local_id(0); | |||
| int lidy = get_local_id(1); | |||
| int ci4 = UP_DIV(in_shape.w, C4NUM); | |||
| int co4 = UP_DIV(out_shape.w, C4NUM); | |||
| int n = out_shape.x; | |||
| int h = out_shape.y; | |||
| int w = out_shape.z; | |||
| int nh_index = gidy / 4; | |||
| bool inside = gidx < co4 && gidz < w && nh_index < n * h; | |||
| int n_index = nh_index / h; | |||
| int h_index = nh_index % h; | |||
| FLT4 result = (FLT4)(0.0f); | |||
| for (uint i = lidy; i < ci4 && inside; i += 4) { | |||
| FLT4 v = READ_IMAGE(input, smp_zero, (int2)(gidz, n_index * ci4 * h + i * h + h_index)); | |||
| FLT16 weight_value = weight[nh_index * ci4 * co4 + i * co4 + gidx]; | |||
| result.x += dot(v, weight_value.s0123); | |||
| result.y += dot(v, weight_value.s4567); | |||
| result.z += dot(v, weight_value.s89ab); | |||
| result.w += dot(v, weight_value.scdef); | |||
| } | |||
| __local FLT4 temp[32][4]; | |||
| temp[lidx][lidy] = result; | |||
| barrier(CLK_LOCAL_MEM_FENCE); | |||
| if (lidy == 0 && inside) { | |||
| result += temp[lidx][1]; | |||
| result += temp[lidx][2]; | |||
| result += temp[lidx][3]; | |||
| WRITE_IMAGE(output, (int2)(gidz, n_index * co4 * h + gidx * h + h_index), result); | |||
| } | |||
| } | |||
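The MatMul kernels removed above (and the NHWC4 variants that remain) share one reduction pattern: four work-items with lidy = 0..3 each accumulate a strided quarter of the CI4 loop, park their partial FLT4 in __local temp, and after the barrier the lidy == 0 item adds the other three partials before writing. A plain C++ sketch of that strided-split-then-combine idea, with a scalar standing in for FLT4, purely illustrative:

#include <array>

// Four "lanes" (lidy = 0..3) each sum a strided quarter of the ci4 range;
// lane 0 then combines the partials, mirroring the __local temp[..][4]
// buffer, the barrier, and the lidy == 0 write-back in the kernels above.
float StridedReduce(const float *values, int ci4) {
  std::array<float, 4> partial = {0.f, 0.f, 0.f, 0.f};
  for (int lane = 0; lane < 4; ++lane) {
    for (int i = lane; i < ci4; i += 4) {
      partial[lane] += values[i];  // per-lane partial dot product
    }
  }
  // "barrier", then lane 0 adds the other partials and produces the result
  return partial[0] + partial[1] + partial[2] + partial[3];
}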
| @@ -1,61 +0,0 @@ | |||
| #ifdef cl_khr_fp16 | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| #endif | |||
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | |||
| __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | |||
| const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(0); | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| FLT4 maximum = (FLT4)(-10000.0f); | |||
| int xs = X * stride.x - padding.x; | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| if (y_c < 0 || y_c >= input_shape.y) continue; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| if (x_c < 0 || x_c >= input_shape.x) continue; | |||
| FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c)); | |||
| maximum = max(src, maximum); | |||
| } | |||
| } | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum); | |||
| } | |||
| __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, | |||
| const int4 input_shape, const int4 output_shape, const int2 stride, | |||
| const int2 kernel_size, const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(0); | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| FLT4 maximum = (FLT4)(-10000.0f); | |||
| int xs = X * stride.x - padding.x; | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| if (y_c < 0 || y_c >= input_shape.y) continue; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| if (x_c < 0 || x_c >= input_shape.x) continue; | |||
| FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c)); | |||
| maximum = max(src, maximum); | |||
| } | |||
| } | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f))); | |||
| } | |||
| @@ -0,0 +1,126 @@ | |||
| #ifdef cl_khr_fp16 | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| #endif | |||
| #define divide_no_check(a, b) (a / b) | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void AvgPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | |||
| const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(0); | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| FLT4 r = (FLT4)(0.0f); | |||
| FLT window_size = 0.0f; | |||
| int xs = X * stride.x - padding.x; | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| bool outside_y = y_c < 0 || y_c >= input_shape.y; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; | |||
| r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f); | |||
| window_size += !outside ? 1.0f : 0.0f; | |||
| } | |||
| } | |||
| FLT4 result = TO_FLT4(divide_no_check(r, window_size)); | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result); | |||
| } | |||
| __kernel void AvgPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, | |||
| const int4 input_shape, const int4 output_shape, const int2 stride, | |||
| const int2 kernel_size, const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(0); | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| FLT4 r = (FLT4)(0.0f); | |||
| FLT window_size = 0.0f; | |||
| int xs = X * stride.x - padding.x; | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| bool outside_y = y_c < 0 || y_c >= input_shape.y; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; | |||
| r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (FLT4)(0.0f); | |||
| window_size += !outside ? 1.0f : 0.0f; | |||
| } | |||
| } | |||
| FLT4 result = TO_FLT4(divide_no_check(r, window_size)); | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(result, (FLT4)(0.f))); | |||
| } | |||
| __kernel void MaxPooling2d_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | |||
| const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(0); | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| FLT4 maximum = (FLT4)(-10000.0f); | |||
| int xs = X * stride.x - padding.x; | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| if (y_c < 0 || y_c >= input_shape.y) continue; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| if (x_c < 0 || x_c >= input_shape.x) continue; | |||
| FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)); | |||
| maximum = max(src, maximum); | |||
| } | |||
| } | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum); | |||
| } | |||
| __kernel void MaxPooling2d_ReLU_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, | |||
| const int4 input_shape, const int4 output_shape, const int2 stride, | |||
| const int2 kernel_size, const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(2); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(0); | |||
| // boundary check | |||
| if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) { | |||
| return; | |||
| } | |||
| FLT4 maximum = (FLT4)(-10000.0f); | |||
| int xs = X * stride.x - padding.x; | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| if (y_c < 0 || y_c >= input_shape.y) continue; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| if (x_c < 0 || x_c >= input_shape.x) continue; | |||
| FLT4 src = READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)); | |||
| maximum = max(src, maximum); | |||
| } | |||
| } | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), max(maximum, (FLT4)(0.f))); | |||
| } | |||
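Throughout these kernels the NHWC4 tensors live in 2D images, and the coordinate math in the pooling reads above, (int2)(y_c * input_shape.w + Z, x_c), implies the following mapping for a single-batch tensor: image x = w * c4_slices + c4, image y = h, with input_shape.w holding the number of channel slices. A small helper restating that mapping under those assumptions (illustration, not part of the patch):

#include <utility>

// A pixel holding channels [4*c4, 4*c4+4) of spatial position (h, w) sits at
// image coordinate (w * c4_slices + c4, h), where c4_slices = ceil(C / 4).
std::pair<int, int> Nhwc4ImageCoord(int h, int w, int c4, int c4_slices) {
  return {w * c4_slices + c4, h};  // (x, y) passed to READ_IMAGE / WRITE_IMAGE
}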
| @@ -42,30 +42,41 @@ using mindspore::schema::PrimitiveType_Activation; | |||
| namespace mindspore::kernel { | |||
| int ActivationOpenClKernel::Init() { | |||
| std::map<int, std::string> kernel_names{ | |||
| std::string ActivationOpenCLKernel::GetActTypeString(int act_type) { | |||
| static std::map<int, std::string> supported_act_type = { | |||
| {ActivationType_LEAKY_RELU, "LeakyRelu"}, {ActivationType_RELU, "Relu"}, {ActivationType_SIGMOID, "Sigmoid"}, | |||
| {ActivationType_RELU6, "Relu6"}, {ActivationType_TANH, "Tanh"}, {ActivationType_SWISH, "Swish"}, | |||
| {ActivationType_HSWISH, "HSwish"}}; | |||
| if (kernel_names.count(type_) == 0) { | |||
| auto result_iter = supported_act_type.find(act_type); | |||
| if (result_iter != supported_act_type.end()) { | |||
| return result_iter->second; | |||
| } | |||
| return ""; | |||
| } | |||
| int ActivationOpenCLKernel::CheckSpecs() { | |||
| if (GetActTypeString(type_).empty()) { | |||
| MS_LOG(ERROR) << "schema::ActivationType:" << type_ << "not found"; | |||
| return mindspore::lite::RET_ERROR; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int ActivationOpenCLKernel::Prepare() { | |||
| outShape = Image2DInfo(out_tensors_[0]); | |||
| local_size_ = {}; | |||
| global_size_ = {outShape.width, outShape.height}; | |||
| std::string source = activation_source; | |||
| std::set<std::string> build_options; | |||
| std::string program_name = "Activation"; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| std::string kernel_name = kernel_names[type_]; | |||
| std::string kernel_name = GetActTypeString(type_); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| SetArgs(); | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| MS_LOG(DEBUG) << kernel_name << " init Done!"; | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| int ActivationOpenClKernel::SetArgs() { | |||
| void ActivationOpenCLKernel::SetConstArgs() { | |||
| int arg_idx = 2; | |||
| cl_int2 image_size = {static_cast<int>(outShape.width), static_cast<int>(outShape.height)}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, image_size); | |||
| @@ -78,50 +89,26 @@ int ActivationOpenClKernel::SetArgs() { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, c4); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, last_c4); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int ActivationOpenClKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " begin running!"; | |||
| void ActivationOpenCLKernel::SetGlobalLocal() { | |||
| local_range_ = cl::NullRange; | |||
| global_range_ = {outShape.width, outShape.height}; | |||
| } | |||
| int ActivationOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| auto ret = ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail."; | |||
| return mindspore::lite::RET_ERROR; | |||
| return RET_ERROR; | |||
| } | |||
| return mindspore::lite::RET_OK; | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenClActivationKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| if (inputs.empty()) { | |||
| MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << inputs.size(); | |||
| return nullptr; | |||
| } | |||
| if (inputs[0]->shape().size() > 2 && inputs[0]->shape()[0] > 1) { | |||
| MS_LOG(ERROR) << "Activation kernel:" << opParameter->name_ << " failed: Unsupported multi-batch."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto *kernel = | |||
| new (std::nothrow) ActivationOpenClKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "New kernel:" << opParameter->name_ << "is nullptr."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| MS_LOG(ERROR) << "Init activation kernel:" << opParameter->name_ << " failed!"; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Activation, OpenClActivationKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Activation, OpenClActivationKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Activation, OpenCLKernelCreator<ActivationOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Activation, OpenCLKernelCreator<ActivationOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
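The activation rewrite above replaces the per-operator creator function and the monolithic Init() with smaller hooks (CheckSpecs, Prepare, SetConstArgs, SetGlobalLocal) plus a shared OpenCLKernelCreator<T> registered for both float types. The template itself is outside this diff; a plausible, simplified sketch of the flow it could implement, built only from the hooks visible here (the actual implementation in the repository may differ):

#include <new>
#include <utility>

// Hedged sketch: construct the kernel, reject it early if CheckSpecs() or
// Prepare() fails, otherwise hand it back. 0 stands in for RET_OK.
template <typename KernelT, typename... Args>
KernelT *CreateOpenCLKernelSketch(Args &&... args) {
  auto *kernel = new (std::nothrow) KernelT(std::forward<Args>(args)...);
  if (kernel == nullptr) {
    return nullptr;
  }
  if (kernel->CheckSpecs() != 0 || kernel->Prepare() != 0) {
    delete kernel;
    return nullptr;
  }
  return kernel;
}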
| @@ -18,26 +18,30 @@ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ACTIVATION_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||
| #include "nnacl/fp32/activation.h" | |||
| namespace mindspore::kernel { | |||
| class ActivationOpenClKernel : public OpenCLKernel { | |||
| class ActivationOpenCLKernel : public OpenCLKernel { | |||
| public: | |||
| ActivationOpenClKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| ActivationOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs) | |||
| : OpenCLKernel(parameter, inputs, outputs), | |||
| type_(reinterpret_cast<ActivationParameter *>(parameter)->type_), | |||
| alpha_(reinterpret_cast<ActivationParameter *>(parameter)->alpha_) {} | |||
| ~ActivationOpenClKernel() override = default; | |||
| ~ActivationOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| int SetArgs(); | |||
| static std::string GetActTypeString(int act_type); | |||
| cl::Kernel kernel_; | |||
| int type_; | |||
| float alpha_; | |||
| @@ -35,41 +35,111 @@ using mindspore::schema::PrimitiveType_Eltwise; | |||
| namespace mindspore::kernel { | |||
| std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const { | |||
| auto out_shape = out_tensors_[0]->shape(); | |||
| if (out_shape.size() == 2) { | |||
| const size_t global_x = 1; | |||
| const size_t global_y = 1; | |||
| const size_t global_z = UP_ROUND_DIV(out_shape[1], C4NUM); | |||
| std::vector<size_t> global = {global_x, global_y, global_z}; | |||
| return global; | |||
| int ArithmeticOpenCLKernel::CheckSpecs() { | |||
| auto *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_); | |||
| if (arithmetic_parameter->broadcasting_) { | |||
| element_flag_ = false; | |||
| kernel_name_ = "BroadcastNHWC4"; | |||
| if (out_tensors_[0]->shape()[0] > 1) { | |||
| MS_LOG(ERROR) << "Broadcasting don't support N > 1"; | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| const size_t global_x = out_shape[2]; | |||
| const size_t global_y = out_shape[1]; | |||
| const size_t global_z = UP_ROUND_DIV(out_shape[3], C4NUM); | |||
| std::vector<size_t> global = {global_x, global_y, global_z}; | |||
| return global; | |||
| kernel_name_ = "Element"; | |||
| } | |||
| switch (op_parameter_->type_) { | |||
| case PrimitiveType_Mul: | |||
| kernel_name_ += "Mul"; | |||
| break; | |||
| case PrimitiveType_Add: | |||
| kernel_name_ += "Add"; | |||
| break; | |||
| case PrimitiveType_Sub: | |||
| kernel_name_ += "Sub"; | |||
| break; | |||
| case PrimitiveType_Div: | |||
| kernel_name_ += "Div"; | |||
| break; | |||
| case PrimitiveType_LogicalAnd: | |||
| kernel_name_ += "And"; | |||
| break; | |||
| case PrimitiveType_LogicalOr: | |||
| kernel_name_ += "Or"; | |||
| break; | |||
| case PrimitiveType_Maximum: | |||
| kernel_name_ += "Max"; | |||
| break; | |||
| case PrimitiveType_Minimum: | |||
| kernel_name_ += "Min"; | |||
| break; | |||
| case PrimitiveType_FloorDiv: | |||
| kernel_name_ += "FloorDiv"; | |||
| break; | |||
| case PrimitiveType_FloorMod: | |||
| kernel_name_ += "FloorMod"; | |||
| break; | |||
| case PrimitiveType_SquaredDifference: | |||
| kernel_name_ += "SquaredDifference"; | |||
| break; | |||
| case PrimitiveType_Equal: | |||
| kernel_name_ += "Equal"; | |||
| break; | |||
| case PrimitiveType_NotEqual: | |||
| kernel_name_ += "NotEqual"; | |||
| break; | |||
| case PrimitiveType_Less: | |||
| kernel_name_ += "Less"; | |||
| break; | |||
| case PrimitiveType_LessEqual: | |||
| kernel_name_ += "LessEqual"; | |||
| break; | |||
| case PrimitiveType_Greater: | |||
| kernel_name_ += "Greater"; | |||
| break; | |||
| case PrimitiveType_GreaterEqual: | |||
| kernel_name_ += "GreaterEqual"; | |||
| break; | |||
| default: | |||
| MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_; | |||
| return RET_ERROR; | |||
| } | |||
| switch (arithmetic_parameter->activation_type_) { | |||
| case schema::ActivationType_NO_ACTIVATION: | |||
| break; | |||
| case schema::ActivationType_RELU: | |||
| activation_min_ = 0.f; | |||
| break; | |||
| case schema::ActivationType_RELU6: | |||
| activation_min_ = 0.f; | |||
| activation_max_ = 6.f; | |||
| break; | |||
| default: | |||
| MS_LOG(ERROR) << "Unsupported activation type " << arithmetic_parameter->activation_type_; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { | |||
| void ArithmeticOpenCLKernel::SetGlobalLocal() { | |||
| if (element_flag_) { | |||
| local_size_ = {16, 16}; | |||
| local_range_ = {}; | |||
| auto out_shape = out_tensors_[0]->shape(); | |||
| if (out_shape.size() == 2) { | |||
| size_t H = out_shape[0]; | |||
| size_t W = UP_DIV(out_shape[1], C4NUM); | |||
| global_size_ = {W, H}; | |||
| global_range_ = {W, H}; | |||
| } else { | |||
| size_t H = out_shape[0] * out_shape[1]; | |||
| size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM); | |||
| global_size_ = {W, H}; | |||
| global_range_ = {W, H}; | |||
| } | |||
| } else { | |||
| local_size_ = {}; | |||
| local_range_ = {}; | |||
| auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); | |||
| global_size_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]), | |||
| static_cast<size_t>(out_shape[1] * out_shape[0])}; | |||
| global_range_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]), | |||
| static_cast<size_t>(out_shape[1] * out_shape[0])}; | |||
| } | |||
| } | |||
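For the element-wise path, SetGlobalLocal() above sizes the 2D range from the NHWC output as width = W * UP_DIV(C, 4) and height = N * H, with UP_DIV being the ceiling division defined earlier in this diff. A worked example of those formulas on a hypothetical 1x16x16x7 output (shape chosen for illustration, not taken from the patch):

#include <cstdio>

#define C4NUM 4
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))  // ceiling division, as in the diff

int main() {
  int n = 1, h = 16, w = 16, c = 7;          // hypothetical NHWC output shape
  size_t global_w = w * UP_DIV(c, C4NUM);    // 16 * 2 = 32
  size_t global_h = static_cast<size_t>(n * h);  // 1 * 16 = 16
  std::printf("global_range = {%zu, %zu}\n", global_w, global_h);  // {32, 16}
  return 0;
}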
| @@ -137,7 +207,7 @@ int ArithmeticOpenCLKernel::InitWeights() { | |||
| return RET_OK; | |||
| } | |||
| int ArithmeticOpenCLKernel::SetArgs() { | |||
| void ArithmeticOpenCLKernel::SetConstArgs() { | |||
| int arg_idx = 3; | |||
| if (!element_flag_) { | |||
| cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2], | |||
| @@ -157,124 +227,37 @@ int ArithmeticOpenCLKernel::SetArgs() { | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, broadcastC_flag); | |||
| } else { | |||
| cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])}; | |||
| cl_int2 output_shape{static_cast<int>(global_range_[0]), static_cast<int>(global_range_[1])}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_); | |||
| return RET_OK; | |||
| } | |||
| int ArithmeticOpenCLKernel::Init() { | |||
| std::string kernel_name; | |||
| auto *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_); | |||
| if (arithmetic_parameter->broadcasting_) { | |||
| element_flag_ = false; | |||
| kernel_name = "BroadcastNHWC4"; | |||
| if (out_tensors_[0]->shape()[0] > 1) { | |||
| MS_LOG(ERROR) << "Broadcasting don't support N > 1"; | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| kernel_name = "Element"; | |||
| } | |||
| switch (op_parameter_->type_) { | |||
| case PrimitiveType_Mul: | |||
| kernel_name += "Mul"; | |||
| break; | |||
| case PrimitiveType_Add: | |||
| kernel_name += "Add"; | |||
| break; | |||
| case PrimitiveType_Sub: | |||
| kernel_name += "Sub"; | |||
| break; | |||
| case PrimitiveType_Div: | |||
| kernel_name += "Div"; | |||
| break; | |||
| case PrimitiveType_LogicalAnd: | |||
| kernel_name += "And"; | |||
| break; | |||
| case PrimitiveType_LogicalOr: | |||
| kernel_name += "Or"; | |||
| break; | |||
| case PrimitiveType_Maximum: | |||
| kernel_name += "Max"; | |||
| break; | |||
| case PrimitiveType_Minimum: | |||
| kernel_name += "Min"; | |||
| break; | |||
| case PrimitiveType_FloorDiv: | |||
| kernel_name += "FloorDiv"; | |||
| break; | |||
| case PrimitiveType_FloorMod: | |||
| kernel_name += "FloorMod"; | |||
| break; | |||
| case PrimitiveType_SquaredDifference: | |||
| kernel_name += "SquaredDifference"; | |||
| break; | |||
| case PrimitiveType_Equal: | |||
| kernel_name += "Equal"; | |||
| break; | |||
| case PrimitiveType_NotEqual: | |||
| kernel_name += "NotEqual"; | |||
| break; | |||
| case PrimitiveType_Less: | |||
| kernel_name += "Less"; | |||
| break; | |||
| case PrimitiveType_LessEqual: | |||
| kernel_name += "LessEqual"; | |||
| break; | |||
| case PrimitiveType_Greater: | |||
| kernel_name += "Greater"; | |||
| break; | |||
| case PrimitiveType_GreaterEqual: | |||
| kernel_name += "GreaterEqual"; | |||
| break; | |||
| default: | |||
| MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_; | |||
| return RET_ERROR; | |||
| } | |||
| switch (arithmetic_parameter->activation_type_) { | |||
| case schema::ActivationType_NO_ACTIVATION: | |||
| break; | |||
| case schema::ActivationType_RELU: | |||
| activation_min_ = 0.f; | |||
| break; | |||
| case schema::ActivationType_RELU6: | |||
| activation_min_ = 0.f; | |||
| activation_max_ = 6.f; | |||
| break; | |||
| default: | |||
| MS_LOG(ERROR) << "Unsupported activation type " << arithmetic_parameter->activation_type_; | |||
| return RET_ERROR; | |||
| } | |||
| int ArithmeticOpenCLKernel::Prepare() { | |||
| lite::STATUS error_code = RET_OK; | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | |||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name_); | |||
| #else | |||
| if (out_mem_type_ == MemType::IMG) { | |||
| kernel_name += "_IMG"; | |||
| kernel_name_ += "_IMG"; | |||
| } else { | |||
| kernel_name += "_BUF"; | |||
| kernel_name_ += "_BUF"; | |||
| } | |||
| std::string program_name = "Arithmetic"; | |||
| std::set<std::string> build_options; | |||
| std::string source = arithmetic_source; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name_, build_options); | |||
| #endif | |||
| if (error_code != RET_OK) { | |||
| return error_code; | |||
| } | |||
| Image2dGetWorkGroupSize(); | |||
| SetGlobalLocal(); | |||
| InitWeights(); | |||
| SetArgs(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| SetConstArgs(); | |||
| MS_LOG(DEBUG) << kernel_name_ << " Init Done!"; | |||
| return RET_OK; | |||
| } | |||
| @@ -287,64 +270,44 @@ int ArithmeticOpenCLKernel::Run() { | |||
| auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1]; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLArithmeticKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = | |||
| new (std::nothrow) ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "Create OpenCL Arithmetic kernel failed!"; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init kernel failed, name: Arithmetic"; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mul, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Add, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sub, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Div, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Equal, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Less, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Greater, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mul, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Add, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Sub, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Div, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Equal, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Less, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Greater, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mul, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Add, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sub, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Div, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalAnd, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalOr, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Maximum, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Minimum, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorDiv, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorMod, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SquaredDifference, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Equal, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_NotEqual, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Less, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LessEqual, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Greater, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_GreaterEqual, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Eltwise, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mul, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Add, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Sub, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Div, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalAnd, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalOr, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Maximum, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Minimum, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorDiv, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorMod, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SquaredDifference, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Equal, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_NotEqual, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Less, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LessEqual, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Greater, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_GreaterEqual, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Eltwise, OpenCLKernelCreator<ArithmeticOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -18,6 +18,7 @@ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include "src/runtime/kernel/arm/fp32/arithmetic_fp32.h" | |||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||
| @@ -30,24 +31,21 @@ class ArithmeticOpenCLKernel : public OpenCLKernel { | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~ArithmeticOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| int InitWeights() override; | |||
| int SetArgs(); | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| std::vector<size_t> InitGlobalSize() const; | |||
| void Image2dGetWorkGroupSize(); | |||
| cl::Kernel kernel_; | |||
| bool element_flag_{true}; | |||
| float activation_min_{-FLT_MAX}; | |||
| float activation_max_{FLT_MAX}; | |||
| std::vector<std::vector<int>> inputs_nhwc_shapes_; | |||
| std::vector<void *> inputs_weight_ptrs_; | |||
| std::vector<size_t> local_size_; | |||
| std::vector<size_t> global_size_; | |||
| std::string kernel_name_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -31,13 +31,17 @@ using mindspore::schema::PrimitiveType_DeConv2D; | |||
| namespace mindspore::kernel { | |||
| int Conv2dTransposeOpenCLKernel::Init() { | |||
| int Conv2dTransposeOpenCLKernel::CheckSpecs() { | |||
| ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| if (param->pad_l_ != param->pad_r_ || param->kernel_h_ - param->stride_h_ != 2 * param->pad_l_ || | |||
| param->pad_u_ != param->pad_d_ || param->kernel_w_ - param->stride_w_ != 2 * param->pad_u_) { | |||
| MS_LOG(ERROR) << "only support kernel - stride == 2 * pad"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int Conv2dTransposeOpenCLKernel::Prepare() { | |||
| std::string kernel_name = "conv2d_transpose_NHWC4"; | |||
| enable_fp16_ = ocl_runtime_->GetFp16Enable(); | |||
| #ifdef PROGRAM_WITH_IL | |||
| @@ -49,12 +53,56 @@ int Conv2dTransposeOpenCLKernel::Init() { | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| PadWeight(); | |||
| InitWeights(); | |||
| SetGlobalLocal(); | |||
| SetConstArgs(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| void Conv2dTransposeOpenCLKernel::PadWeight() { | |||
| void Conv2dTransposeOpenCLKernel::SetGlobalLocal() { | |||
| ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| int co = out_tensors_[0]->shape()[3]; | |||
| int co4 = UP_DIV(co, C4NUM); | |||
| int stride_h = param->stride_h_; | |||
| int stride_w = param->stride_w_; | |||
| int oh = out_tensors_[0]->shape()[1]; | |||
| int ow = out_tensors_[0]->shape()[2]; | |||
| local_size_ = {16, 1, 16}; | |||
| global_size_ = {(size_t)UP_ROUND(oh / 2, stride_h), (size_t)UP_ROUND(ow / 2, stride_w), (size_t)co4}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| void Conv2dTransposeOpenCLKernel::SetConstArgs() { | |||
| int arg_cnt = 2; | |||
| ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| int ci = in_tensors_[0]->shape()[3]; | |||
| int co = out_tensors_[0]->shape()[3]; | |||
| int kh = param->kernel_h_; | |||
| int kw = param->kernel_w_; | |||
| int pad_h = param->pad_l_; | |||
| int pad_w = param->pad_u_; | |||
| int stride_h = param->stride_h_; | |||
| int stride_w = param->stride_w_; | |||
| int oh = out_tensors_[0]->shape()[1]; | |||
| int ow = out_tensors_[0]->shape()[2]; | |||
| int h = in_tensors_[0]->shape()[1]; | |||
| int w = in_tensors_[0]->shape()[2]; | |||
| cl_int2 kernel_size = {kh, kw}; | |||
| cl_int2 stride = {stride_h, stride_w}; | |||
| cl_int2 padding = {pad_h, pad_w}; | |||
| cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), 1}; | |||
| cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size); | |||
| } | |||
| int Conv2dTransposeOpenCLKernel::InitWeights() { | |||
| ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| int ci = in_tensors_[0]->shape()[3]; | |||
| int co = out_tensors_[0]->shape()[3]; | |||
| @@ -138,67 +186,18 @@ void Conv2dTransposeOpenCLKernel::PadWeight() { | |||
| } | |||
| } | |||
| allocator->UnmapBuffer(bias_); | |||
| return RET_OK; | |||
| } | |||
| int Conv2dTransposeOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| int ci = in_tensors_[0]->shape()[3]; | |||
| int co = out_tensors_[0]->shape()[3]; | |||
| int co4 = UP_DIV(co, C4NUM); | |||
| int kh = param->kernel_h_; | |||
| int kw = param->kernel_w_; | |||
| int pad_h = param->pad_l_; | |||
| int pad_w = param->pad_u_; | |||
| int stride_h = param->stride_h_; | |||
| int stride_w = param->stride_w_; | |||
| int oh = out_tensors_[0]->shape()[1]; | |||
| int ow = out_tensors_[0]->shape()[2]; | |||
| int h = in_tensors_[0]->shape()[1]; | |||
| int w = in_tensors_[0]->shape()[2]; | |||
| // local size should less than MAX_GROUP_SIZE | |||
| std::vector<size_t> local = {16, 1, 16}; | |||
| std::vector<size_t> global = {(size_t)UP_ROUND(oh / 2, stride_h), (size_t)UP_ROUND(ow / 2, stride_w), (size_t)co4}; | |||
| cl_int2 kernel_size = {kh, kw}; | |||
| cl_int2 stride = {stride_h, stride_w}; | |||
| cl_int2 padding = {pad_h, pad_w}; | |||
| cl_int4 src_size = {h, w, UP_DIV(ci, C4NUM), 1}; | |||
| cl_int4 dst_size = {oh, ow, UP_DIV(co, C4NUM), 1}; | |||
| int arg_cnt = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padWeight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size); | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLConv2dTransposeKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::InnerContext *ctx, | |||
| const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = | |||
| new (std::nothrow) Conv2dTransposeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DeConv2D, OpenCLConv2dTransposeKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DeConv2D, OpenCLConv2dTransposeKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DeConv2D, OpenCLKernelCreator<Conv2dTransposeOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DeConv2D, OpenCLKernelCreator<Conv2dTransposeOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -32,12 +32,14 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel { | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~Conv2dTransposeOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| int InitWeights() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| void PadWeight(); | |||
| cl::Kernel kernel_; | |||
| void *padWeight_{nullptr}; | |||
| void *bias_{nullptr}; | |||
| @@ -34,7 +34,11 @@ using mindspore::schema::PrimitiveType_FullConnection; | |||
| namespace mindspore::kernel { | |||
| int FullConnectionOpenCLKernel::Init() { | |||
| std::string kernel_name = "FullConnection_NHWC4"; | |||
| // deleted soon | |||
| return CheckSpecs(); | |||
| } | |||
| int FullConnectionOpenCLKernel::CheckSpecs() { | |||
| auto param = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||
| transposeA = param->a_transpose_; | |||
| if (transposeA) { | |||
| @@ -48,9 +52,6 @@ int FullConnectionOpenCLKernel::Init() { | |||
| MS_LOG(ERROR) << "fullconnection only support input output shape size = 2 or 4"; | |||
| return RET_ERROR; | |||
| } | |||
| // call default move constructor(elemwised moved) | |||
| inShape = Image2DInfo(in_tensors_[0]); | |||
| outShape = Image2DInfo(out_tensors_[0]); | |||
| switch (param->act_type_) { | |||
| case ActType_No: | |||
| break; | |||
| @@ -65,6 +66,13 @@ int FullConnectionOpenCLKernel::Init() { | |||
| MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int FullConnectionOpenCLKernel::Prepare() { | |||
| std::string kernel_name = "FullConnection_NHWC4"; | |||
| inShape = Image2DInfo(in_tensors_[0]); | |||
| outShape = Image2DInfo(out_tensors_[0]); | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| @@ -74,13 +82,14 @@ int FullConnectionOpenCLKernel::Init() { | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| PadWeight(); | |||
| InitWeights(); | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return RET_OK; | |||
| } | |||
| void FullConnectionOpenCLKernel::PadWeight() { | |||
| int FullConnectionOpenCLKernel::InitWeights() { | |||
| auto allocator = ocl_runtime_->GetAllocator(); | |||
| int ci = inShape.C; | |||
| int ci4 = UP_DIV(ci, C4NUM); | |||
| @@ -167,48 +176,37 @@ void FullConnectionOpenCLKernel::PadWeight() { | |||
| } | |||
| } | |||
| allocator->UnmapBuffer(bias_); | |||
| return RET_OK; | |||
| } | |||
| int FullConnectionOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| void FullConnectionOpenCLKernel::SetGlobalLocal() { | |||
| std::vector<size_t> local = {32, 4, 1}; | |||
| std::vector<size_t> global = {UP_DIV(outShape.C, C4NUM), 4, outShape.N}; | |||
| int arg_count = 0; | |||
| AlignGlobalLocal(global, local); | |||
| } | |||
| void FullConnectionOpenCLKernel::SetConstArgs() { | |||
| int arg_count = 2; | |||
| cl_int4 in_shape = {static_cast<int>(inShape.N), static_cast<int>(inShape.H), static_cast<int>(inShape.W), | |||
| static_cast<int>(inShape.C)}; | |||
| cl_int2 out_shape = {static_cast<int>(outShape.N), static_cast<int>(outShape.C)}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_min_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_max_); | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLFullConnectionKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::InnerContext *ctx, | |||
| const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = | |||
| new (std::nothrow) FullConnectionOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| int FullConnectionOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| int arg_count = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return RET_OK; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FullConnection, OpenCLFullConnectionKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FullConnection, OpenCLFullConnectionKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FullConnection, OpenCLKernelCreator<FullConnectionOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FullConnection, OpenCLKernelCreator<FullConnectionOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
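The hunks above show the pattern this PR applies to every kernel: the old monolithic Init() is split into CheckSpecs() (pure validation), Prepare() (kernel build, weight init, constant args, NDRange), and a Run() that only rebinds the input/output pointers before enqueueing with the cached global_range_/local_range_. The sketch below is a minimal standalone illustration of that call order; every class and member name in it is a stand-in, not the real MindSpore OpenCLKernel types.

```cpp
// Standalone sketch of the lifecycle split shown above; FakeRuntime and DemoKernel
// are stand-ins, not MindSpore code.
#include <cstdio>
#include <vector>

struct FakeRuntime {  // stand-in for the OpenCL runtime wrapper
  void SetKernelArg(int idx, const char *what) { std::printf("arg[%d] <- %s\n", idx, what); }
  void RunKernel(const std::vector<size_t> &global, const std::vector<size_t> &local) {
    std::printf("enqueue: %zu global dims, %zu local dims\n", global.size(), local.size());
  }
};

class DemoKernel {
 public:
  // Validate shapes/attributes only; no GPU resources are touched here.
  int CheckSpecs() { return 0; }
  // One-time setup: in the real kernels this also builds the cl::Kernel and
  // uploads padded weights before binding the constant arguments.
  int Prepare() {
    SetConstArgs();
    SetGlobalLocal();
    return 0;
  }
  // Constant arguments start at index 2: indices 0 and 1 are reserved for the
  // input and output tensors, which Run() rebinds on every invocation.
  void SetConstArgs() {
    int arg = 2;
    rt_.SetKernelArg(arg++, "in_shape");
    rt_.SetKernelArg(arg++, "out_shape");
  }
  void SetGlobalLocal() {
    global_range_ = {32, 4, 1};
    local_range_ = {32, 4, 1};
  }
  int Run() {
    int arg = 0;
    rt_.SetKernelArg(arg++, "input data");
    rt_.SetKernelArg(arg++, "output data");
    rt_.RunKernel(global_range_, local_range_);
    return 0;
  }

 private:
  FakeRuntime rt_;
  std::vector<size_t> global_range_, local_range_;
};

int main() {
  DemoKernel k;
  if (k.CheckSpecs() != 0) return 1;  // reject unsupported specs before any setup
  k.Prepare();                        // done once
  return k.Run();                     // done per inference
}
```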
| @@ -31,12 +31,15 @@ class FullConnectionOpenCLKernel : public OpenCLKernel { | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~FullConnectionOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| int InitWeights() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int Init() override; | |||
| private: | |||
| void PadWeight(); | |||
| cl::Kernel kernel_; | |||
| void *padWeight_{nullptr}; | |||
| void *bias_{nullptr}; | |||
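SetGlobalLocal() above ends with AlignGlobalLocal(global, local), whose body is not part of these hunks. A plausible reading, and only an assumption here, is that it stores the ranges after rounding each global dimension up to a multiple of the corresponding local dimension, which is the usual requirement for an OpenCL NDRange:

```cpp
// Assumed behaviour of AlignGlobalLocal(): round each global dimension up to a
// multiple of the matching local dimension. This is a sketch, not the real code.
#include <cstdio>
#include <vector>

std::vector<size_t> AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
  std::vector<size_t> aligned = global;
  for (size_t i = 0; i < global.size() && i < local.size(); ++i) {
    if (local[i] != 0) {
      aligned[i] = (global[i] + local[i] - 1) / local[i] * local[i];  // round up
    }
  }
  return aligned;
}

int main() {
  auto g = AlignGlobalLocal({30, 7, 3}, {32, 4, 1});
  std::printf("aligned global = {%zu, %zu, %zu}\n", g[0], g[1], g[2]);  // {32, 8, 3}
  return 0;
}
```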
| @@ -30,8 +30,7 @@ using mindspore::schema::PrimitiveType_MatMul; | |||
| namespace mindspore::kernel { | |||
| int MatMulOpenCLKernel::Init() { | |||
| std::string kernel_name = "MatMul_NHWC4"; | |||
| int MatMulOpenCLKernel::CheckSpecs() { | |||
| auto param = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||
| transposeA = param->a_transpose_; | |||
| if (transposeA) { | |||
| @@ -45,6 +44,11 @@ int MatMulOpenCLKernel::Init() { | |||
| MS_LOG(ERROR) << "matmul only support input shape size=2 or 4."; | |||
| return mindspore::lite::RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int MatMulOpenCLKernel::Prepare() { | |||
| std::string kernel_name = "MatMul_NHWC4"; | |||
| dims = in_tensors_[0]->shape().size(); | |||
| for (int i = 0; i < dims; i++) { | |||
| inShape[MAX_DIMS - dims + i] = in_tensors_[0]->shape()[i]; | |||
| @@ -61,13 +65,14 @@ int MatMulOpenCLKernel::Init() { | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| PadWeight(); | |||
| InitWeights(); | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| void MatMulOpenCLKernel::PadWeight() { | |||
| int MatMulOpenCLKernel::InitWeights() { | |||
| // ABMCI @ ABCICO = ABMCO | |||
| auto allocator = ocl_runtime_->GetAllocator(); | |||
| int ci = inShape[3]; | |||
| @@ -128,45 +133,36 @@ void MatMulOpenCLKernel::PadWeight() { | |||
| } | |||
| } | |||
| allocator->UnmapBuffer(padWeight_); | |||
| return RET_OK; | |||
| } | |||
| int MatMulOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| void MatMulOpenCLKernel::SetGlobalLocal() { | |||
| // local size should be less than MAX_GROUP_SIZE | |||
| std::vector<size_t> local = {32, 4, 1}; | |||
| std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM), | |||
| 4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]), | |||
| static_cast<size_t>(outShape[2])}; | |||
| int arg_count = 0; | |||
| AlignGlobalLocal(global, local); | |||
| } | |||
| void MatMulOpenCLKernel::SetConstArgs() { | |||
| int arg_count = 2; | |||
| cl_int4 in_shape = {inShape[0], inShape[1], inShape[2], inShape[3]}; | |||
| cl_int4 out_shape = {outShape[0], outShape[1], outShape[2], outShape[3]}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape); | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLMatMulKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) MatMulOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| int MatMulOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| int arg_count = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLMatMulKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLMatMulKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_MatMul, OpenCLKernelCreator<MatMulOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_MatMul, OpenCLKernelCreator<MatMulOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -31,12 +31,14 @@ class MatMulOpenCLKernel : public OpenCLKernel { | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~MatMulOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| int InitWeights() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| void PadWeight(); | |||
| cl::Kernel kernel_; | |||
| void *padWeight_{nullptr}; | |||
| bool enable_fp16_{false}; | |||
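Both InitWeights() bodies above work in units of UP_DIV(c, C4NUM) slices because the NHWC4 layout packs four channels into one FLT4 texel, and the weight buffer is zero-padded so partial slices do not disturb the accumulation. The arithmetic is illustrated below; the channel counts are made up and the real padding loops are the ones elided in the hunks.

```cpp
// Illustration of the UP_DIV / C4NUM slice arithmetic behind the padded weight
// buffers; values are hypothetical, the real loops live in InitWeights().
#include <cstdio>
#include <vector>

constexpr int C4NUM = 4;
constexpr int UP_DIV(int x, int y) { return (x + y - 1) / y; }

int main() {
  int ci = 6, co = 10;                 // hypothetical input/output channel counts
  int ci4 = UP_DIV(ci, C4NUM);         // 2 input slices
  int co4 = UP_DIV(co, C4NUM);         // 3 output slices
  // Zero-initialised so the padded lanes contribute nothing to the dot products.
  std::vector<float> pad_weight(static_cast<size_t>(ci4) * C4NUM * co4 * C4NUM, 0.0f);
  std::printf("ci=%d -> ci4=%d, co=%d -> co4=%d, padded weight = %zu floats\n",
              ci, ci4, co, co4, pad_weight.size());
  return 0;
}
```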
| @@ -21,8 +21,7 @@ | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/kernel/opencl/utils.h" | |||
| #ifndef PROGRAM_WITH_IL | |||
| #include "src/runtime/kernel/opencl/cl/avg_pool2d.cl.inc" | |||
| #include "src/runtime/kernel/opencl/cl/max_pool2d.cl.inc" | |||
| #include "src/runtime/kernel/opencl/cl/pooling2d.cl.inc" | |||
| #endif | |||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| @@ -36,27 +35,25 @@ using mindspore::schema::PrimitiveType_Pooling; | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| int PoolingOpenCLKernel::Init() { | |||
| int PoolingOpenCLKernel::CheckSpecs() { | |||
| if (parameter_->pool_mode_ != PoolMode_MaxPool && parameter_->pool_mode_ != PoolMode_AvgPool) { | |||
| MS_LOG(ERROR) << "Init `Pooling2d` kernel failed, unsupported pool mode!"; | |||
| return RET_ERROR; | |||
| } | |||
| if (parameter_->act_type_ != ActType_No && parameter_->act_type_ != ActType_Relu) { | |||
| MS_LOG(ERROR) << "Unsupported activation type " << parameter_->act_type_; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int PoolingOpenCLKernel::Prepare() { | |||
| std::string kernel_name; | |||
| #ifndef PROGRAM_WITH_IL | |||
| std::string source; | |||
| std::string program_name; | |||
| #endif | |||
| if (parameter_->pool_mode_ == PoolMode_MaxPool) { | |||
| kernel_name = "MaxPooling2d"; | |||
| #ifndef PROGRAM_WITH_IL | |||
| source = max_pool2d_source; | |||
| program_name = "MaxPooling2d"; | |||
| #endif | |||
| } else if (parameter_->pool_mode_ == PoolMode_AvgPool) { | |||
| kernel_name = "AvgPooling2d"; | |||
| #ifndef PROGRAM_WITH_IL | |||
| source = avg_pool2d_source; | |||
| program_name = "AvgPooling2d"; | |||
| #endif | |||
| } else { | |||
| MS_LOG(ERROR) << "Init `Pooling2d` kernel failed!"; | |||
| return RET_INVALID_OP_NAME; | |||
| } | |||
| switch (parameter_->act_type_) { | |||
| case ActType_No: | |||
| @@ -66,42 +63,35 @@ int PoolingOpenCLKernel::Init() { | |||
| break; | |||
| default: | |||
| MS_LOG(ERROR) << "Unsupported activation type " << parameter_->act_type_; | |||
| return RET_ERROR; | |||
| break; | |||
| } | |||
| enable_fp16_ = ocl_runtime_->GetFp16Enable(); | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| kernel_name += "_NHWC4"; | |||
| if (out_mem_type_ == MemType::BUF) { | |||
| MS_LOG(ERROR) << "buffer output not support yet."; | |||
| return mindspore::lite::RET_ERROR; | |||
| } else { | |||
| kernel_name += "_IMG"; | |||
| } | |||
| kernel_name += "_IMG"; | |||
| std::set<std::string> build_options; | |||
| std::string source = pooling2d_source; | |||
| std::string program_name = "Pooling2d"; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| InitGlobalSize(); | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| void PoolingOpenCLKernel::InitGlobalSize() { | |||
| void PoolingOpenCLKernel::SetGlobalLocal() { | |||
| const size_t global_x = out_tensors_[0]->shape()[1]; | |||
| const size_t global_y = out_tensors_[0]->shape()[2]; | |||
| const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); | |||
| global_size_ = {global_z, global_y, global_x}; | |||
| int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); | |||
| local_size_ = GetCommonLocalSize(global_size_, max_work_group_size); | |||
| global_size_ = GetCommonGlobalSize(local_size_, global_size_); | |||
| global_range_ = {global_z, global_y, global_x}; | |||
| local_range_ = {}; | |||
| } | |||
| int PoolingOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| void PoolingOpenCLKernel::SetConstArgs() { | |||
| int slices = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); | |||
| cl_int4 input_shape = {in_tensors_[0]->shape()[1], in_tensors_[0]->shape()[2], in_tensors_[0]->shape()[3], slices}; | |||
| cl_int4 output_shape = {out_tensors_[0]->shape()[1], out_tensors_[0]->shape()[2], out_tensors_[0]->shape()[3], | |||
| @@ -109,40 +99,24 @@ int PoolingOpenCLKernel::Run() { | |||
| cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_}; | |||
| cl_int2 kernel_size = {parameter_->window_h_, parameter_->window_w_}; | |||
| cl_int2 padding = {parameter_->pad_u_, parameter_->pad_l_}; | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| int arg_idx = 2; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, stride); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, kernel_size); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, padding); | |||
| ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) PoolingOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "Create OpenCL Pooling kernel failed!"; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (RET_OK != ret) { | |||
| MS_LOG(ERROR) << "Init OpenCL Pooling kernel failed!"; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| int PoolingOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pooling, OpenCLKernelCreator<PoolingOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pooling, OpenCLKernelCreator<PoolingOpenCLKernel>) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
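With avg_pool2d.cl and max_pool2d.cl merged into a single pooling2d.cl, Prepare() above now only has to choose the entry-point name inside the one "Pooling2d" program. The name assembly is small enough to restate on its own; the enum below is a stand-in for PoolMode_MaxPool/PoolMode_AvgPool, and the string pieces match the diff.

```cpp
// Sketch of the kernel-name assembly in the new PoolingOpenCLKernel::Prepare().
#include <iostream>
#include <stdexcept>
#include <string>

enum class PoolMode { kMax, kAvg };  // stand-in enum

std::string PoolingKernelName(PoolMode mode) {
  std::string name;
  if (mode == PoolMode::kMax) {
    name = "MaxPooling2d";
  } else if (mode == PoolMode::kAvg) {
    name = "AvgPooling2d";
  } else {
    throw std::invalid_argument("unsupported pool mode");  // rejected by CheckSpecs()
  }
  name += "_NHWC4";  // tensor layout suffix
  name += "_IMG";    // image2d (texture) output; the buffer-output path is gone
  return name;
}

int main() {
  std::cout << PoolingKernelName(PoolMode::kMax) << "\n";  // MaxPooling2d_NHWC4_IMG
  std::cout << PoolingKernelName(PoolMode::kAvg) << "\n";  // AvgPooling2d_NHWC4_IMG
  return 0;
}
```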
| @@ -31,14 +31,15 @@ class PoolingOpenCLKernel : public OpenCLKernel { | |||
| : OpenCLKernel(parameter, inputs, outputs), parameter_(reinterpret_cast<PoolingParameter *>(parameter)) {} | |||
| ~PoolingOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| void InitGlobalSize(); | |||
| PoolingParameter *parameter_; | |||
| cl::Kernel kernel_; | |||
| bool enable_fp16_{false}; | |||
| std::vector<size_t> local_size_; | |||
| std::vector<size_t> global_size_; | |||
| }; | |||
| @@ -40,14 +40,19 @@ using mindspore::schema::ReduceMode_ReduceSumSquare; | |||
| namespace mindspore::kernel { | |||
| int ReduceOpenCLKernel::Init() { | |||
| InitNHWCShape(); | |||
| auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_); | |||
| if (reduce_param == nullptr) { | |||
| return RET_NULL_PTR; | |||
| std::string ReduceOpenCLKernel::GetReduceTypeStr(int type) { | |||
| static const std::map<int, std::string> reduce_type2str{{ReduceMode_ReduceMean, "mean"}, | |||
| {ReduceMode_ReduceSum, "sum"}}; | |||
| auto result_iter = reduce_type2str.find(type); | |||
| if (result_iter != reduce_type2str.end()) { | |||
| return result_iter->second; | |||
| } | |||
| std::map<int, std::string> reduce_type2str{{ReduceMode_ReduceMean, "mean"}, {ReduceMode_ReduceSum, "sum"}}; | |||
| if (reduce_type2str.find(reduce_param->mode_) == reduce_type2str.end()) { | |||
| return ""; | |||
| } | |||
| int ReduceOpenCLKernel::CheckSpecs() { | |||
| auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_); | |||
| if (GetReduceTypeStr(reduce_param->mode_).empty()) { | |||
| MS_LOG(ERROR) << "not supported reduce type:" << reduce_param->mode_; | |||
| return RET_PARAM_INVALID; | |||
| } | |||
| @@ -67,7 +72,17 @@ int ReduceOpenCLKernel::Init() { | |||
| MS_LOG(ERROR) << "reduce axis (2,3) should keep dims"; | |||
| return RET_PARAM_INVALID; | |||
| } | |||
| std::string kernel_name = reduce_type2str.at(reduce_param->mode_); | |||
| return RET_OK; | |||
| } | |||
| int ReduceOpenCLKernel::Prepare() { | |||
| outShape = Image2DInfo(out_tensors_[0]); | |||
| auto reduce_param = reinterpret_cast<ReduceParameter *>(op_parameter_); | |||
| if (reduce_param == nullptr) { | |||
| return RET_NULL_PTR; | |||
| } | |||
| std::string kernel_name = GetReduceTypeStr(reduce_param->mode_); | |||
| if (wc_reduce_) { | |||
| kernel_name += "_WC"; | |||
| } | |||
| @@ -77,7 +92,6 @@ int ReduceOpenCLKernel::Init() { | |||
| kernel_name += "_local"; | |||
| } | |||
| kernel_name += "_NHWC4"; | |||
| enable_fp16_ = ocl_runtime_->GetFp16Enable(); | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | |||
| @@ -88,32 +102,26 @@ int ReduceOpenCLKernel::Init() { | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| void ReduceOpenCLKernel::InitNHWCShape() { | |||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||
| size_t n = 1, h = 1, w = 1, c = 1; | |||
| if (shapex.size() == 2) { | |||
| n = shapex[0]; | |||
| c = shapex[1]; | |||
| } else if (shapex.size() == 4) { | |||
| n = shapex[0]; | |||
| h = shapex[1]; | |||
| w = shapex[2]; | |||
| c = shapex[3]; | |||
| } | |||
| nhwc_shape_ = {n, h, w, c}; | |||
| } | |||
| int ReduceOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| void ReduceOpenCLKernel::SetConstArgs() { | |||
| std::vector<int> shapex = in_tensors_[0]->shape(); | |||
| int h = shapex[1]; | |||
| int w = shapex[2]; | |||
| int c = shapex[3]; | |||
| int c4 = UP_DIV(c, C4NUM); | |||
| cl_int4 size = {h, w, c4, c}; | |||
| int arg_idx = 2; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size); | |||
| } | |||
| void ReduceOpenCLKernel::SetGlobalLocal() { | |||
| std::vector<int> shapex = in_tensors_[0]->shape(); | |||
| int h = shapex[1]; | |||
| int c = shapex[3]; | |||
| int c4 = UP_DIV(c, C4NUM); | |||
| std::vector<size_t> local = {}; | |||
| if (use_local_) { | |||
| local = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD}; | |||
| @@ -122,35 +130,20 @@ int ReduceOpenCLKernel::Run() { | |||
| if (wc_reduce_) { | |||
| global = {static_cast<size_t>(h), 1, 1}; | |||
| } | |||
| cl_int4 size = {h, w, c4, c}; | |||
| AlignGlobalLocal(global, local); | |||
| } | |||
| int ReduceOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, size); | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLReduceKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) ReduceOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mean, OpenCLReduceKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mean, OpenCLReduceKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reduce, OpenCLReduceKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reduce, OpenCLReduceKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mean, OpenCLKernelCreator<ReduceOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mean, OpenCLKernelCreator<ReduceOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reduce, OpenCLKernelCreator<ReduceOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reduce, OpenCLKernelCreator<ReduceOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -18,7 +18,7 @@ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_REDUCE_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||
| #include "nnacl/reduce_parameter.h" | |||
| @@ -31,14 +31,16 @@ class ReduceOpenCLKernel : public OpenCLKernel { | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~ReduceOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| void InitNHWCShape(); | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| static std::string GetReduceTypeStr(int type); | |||
| cl::Kernel kernel_; | |||
| bool enable_fp16_{false}; | |||
| std::vector<size_t> nhwc_shape_; | |||
| Image2DInfo outShape = Image2DInfo(nullptr); | |||
| bool use_local_{false}; | |||
| bool wc_reduce_{false}; | |||
| static const size_t LOCAL_CACHE_THREAD{16}; | |||
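GetReduceTypeStr() above replaces the local map the old Init() rebuilt on every call: the mode is looked up once in a static table, an empty string means "unsupported", and Prepare() then appends the _WC/_local/_NHWC4 suffixes. A compact restatement, with placeholder integer mode values standing in for the schema's ReduceMode constants:

```cpp
// Sketch of the reduce kernel-name selection; 0/1 stand in for
// ReduceMode_ReduceMean / ReduceMode_ReduceSum.
#include <iostream>
#include <map>
#include <string>

std::string GetReduceTypeStr(int type) {
  static const std::map<int, std::string> reduce_type2str{{0, "mean"}, {1, "sum"}};
  auto it = reduce_type2str.find(type);
  return it != reduce_type2str.end() ? it->second : "";  // empty string => unsupported
}

std::string ReduceKernelName(int mode, bool wc_reduce, bool use_local) {
  std::string name = GetReduceTypeStr(mode);
  if (name.empty()) return "";       // CheckSpecs() rejects this case before Prepare() runs
  if (wc_reduce) name += "_WC";      // reduce over W and C together
  if (use_local) name += "_local";   // local-memory variant
  return name + "_NHWC4";
}

int main() {
  std::cout << ReduceKernelName(0, true, false) << "\n";  // mean_WC_NHWC4
  std::cout << ReduceKernelName(1, false, true) << "\n";  // sum_local_NHWC4
  return 0;
}
```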
| @@ -32,7 +32,6 @@ class ReshapeOpenCLKernel : public OpenCLKernel { | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| @@ -32,27 +32,32 @@ using mindspore::schema::PrimitiveType_Resize; | |||
| namespace mindspore::kernel { | |||
| int ResizeOpenCLKernel::Init() { | |||
| auto resize_param = reinterpret_cast<ResizeParameter *>(op_parameter_); | |||
| if (resize_param == nullptr) { | |||
| return RET_NULL_PTR; | |||
| } | |||
| alignCorner = resize_param->align_corners_; | |||
| preserveAspectRatio = resize_param->preserve_aspect_ratio_; | |||
| int ResizeOpenCLKernel::CheckSpecs() { | |||
| auto in_shape = in_tensors_[0]->shape(); | |||
| auto out_shape = out_tensors_[0]->shape(); | |||
| if (in_shape.size() != 4 || out_shape.size() != 4 || in_shape[0] != out_shape[0] || in_shape[3] != out_shape[3]) { | |||
| MS_LOG(ERROR) << "resize op only support 4D and axes HW"; | |||
| return RET_PARAM_INVALID; | |||
| } | |||
| auto resize_param = reinterpret_cast<ResizeParameter *>(op_parameter_); | |||
| if (resize_param->method_ != schema::ResizeMethod_LINEAR && resize_param->method_ != schema::ResizeMethod_NEAREST) { | |||
| MS_LOG(ERROR) << "unsupported resize method:" << resize_param->method_; | |||
| return RET_PARAM_INVALID; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int ResizeOpenCLKernel::Prepare() { | |||
| auto resize_param = reinterpret_cast<ResizeParameter *>(op_parameter_); | |||
| alignCorner = resize_param->align_corners_; | |||
| preserveAspectRatio = resize_param->preserve_aspect_ratio_; | |||
| auto in_shape = in_tensors_[0]->shape(); | |||
| auto out_shape = out_tensors_[0]->shape(); | |||
| std::string kernel_name = "resize"; | |||
| if (resize_param->method_ == schema::ResizeMethod_LINEAR) { | |||
| kernel_name += "_bilinear"; | |||
| } else if (resize_param->method_ == schema::ResizeMethod_NEAREST) { | |||
| kernel_name += "_nearest_neighbor"; | |||
| } else { | |||
| MS_LOG(ERROR) << "unsupported resize method:" << resize_param->method_; | |||
| return RET_PARAM_INVALID; | |||
| } | |||
| kernel_name += "_NHWC4"; | |||
| #ifdef PROGRAM_WITH_IL | |||
| @@ -64,6 +69,8 @@ int ResizeOpenCLKernel::Init() { | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return RET_OK; | |||
| } | |||
| @@ -74,8 +81,7 @@ float ResizeOpenCLKernel::getResizeScaleFactor(int input_size, int output_size) | |||
| : static_cast<float>(input_size) / static_cast<float>(output_size); | |||
| } | |||
| int ResizeOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| void ResizeOpenCLKernel::SetConstArgs() { | |||
| auto in_shape = in_tensors_[0]->shape(); | |||
| auto out_shape = out_tensors_[0]->shape(); | |||
| int n = out_shape[0]; | |||
| @@ -85,39 +91,30 @@ int ResizeOpenCLKernel::Run() { | |||
| int c4 = UP_DIV(c, C4NUM); | |||
| float scale_h = getResizeScaleFactor(in_tensors_[0]->shape()[1], out_tensors_[0]->shape()[1]); | |||
| float scale_w = getResizeScaleFactor(in_tensors_[0]->shape()[2], out_tensors_[0]->shape()[2]); | |||
| std::vector<size_t> local = {}; | |||
| std::vector<size_t> global = {static_cast<size_t>(c4), static_cast<size_t>(w), static_cast<size_t>(h)}; | |||
| cl_int4 in_size = {in_shape[0], in_shape[1], in_shape[2], UP_DIV(in_shape[3], C4NUM)}; | |||
| cl_int4 out_size = {n, h, w, c4}; | |||
| cl_float2 scale = {scale_h, scale_w}; | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| int arg_idx = 2; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_size); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_size); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, scale); | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLResizeKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) ResizeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| void ResizeOpenCLKernel::SetGlobalLocal() { | |||
| local_range_ = {}; | |||
| auto out_shape = Image2DInfo(out_tensors_[0]); | |||
| global_range_ = {out_shape.Slice, out_shape.W, out_shape.H}; | |||
| } | |||
| int ResizeOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return RET_OK; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Resize, OpenCLResizeKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Resize, OpenCLResizeKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Resize, OpenCLKernelCreator<ResizeOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Resize, OpenCLKernelCreator<ResizeOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -31,8 +31,11 @@ class ResizeOpenCLKernel : public OpenCLKernel { | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~ResizeOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| float getResizeScaleFactor(int input_size, int output_size); | |||
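getResizeScaleFactor() above is only partially visible: the hunk shows the non-align-corners branch, input_size / output_size. The sketch below fills in the align-corners branch with the conventional (in - 1) / (out - 1) form, which is an assumption rather than something taken from the diff.

```cpp
// Assumed shape of the resize scale factor; only the last return is confirmed by the hunk.
#include <cstdio>

float GetResizeScaleFactor(int input_size, int output_size, bool align_corners) {
  if (align_corners && output_size > 1) {
    // Assumption: align-corners maps the first/last samples onto each other.
    return static_cast<float>(input_size - 1) / static_cast<float>(output_size - 1);
  }
  return static_cast<float>(input_size) / static_cast<float>(output_size);
}

int main() {
  // scale_h / scale_w as computed in SetConstArgs(): input H,W over output H,W
  std::printf("scale_h=%f scale_w=%f\n",
              GetResizeScaleFactor(16, 32, false), GetResizeScaleFactor(24, 48, false));
  return 0;
}
```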
| @@ -42,51 +42,17 @@ std::vector<float> SoftmaxOpenCLKernel::GetMaskForLastChannel(int channels) { | |||
| return mask; | |||
| } | |||
| int SoftmaxOpenCLKernel::InitGlobalSize() { | |||
| size_t global_x, global_y; | |||
| const size_t global_z = 1; | |||
| if (axis_ == 1) { | |||
| global_x = UP_DIV(nhwc_shape_[3], C4NUM); | |||
| global_y = nhwc_shape_[2]; | |||
| } else if (axis_ == 2) { | |||
| global_x = UP_DIV(nhwc_shape_[3], C4NUM); | |||
| global_y = nhwc_shape_[1]; | |||
| } else if (axis_ == 3) { | |||
| global_x = nhwc_shape_[2]; | |||
| global_y = nhwc_shape_[1]; | |||
| } else { | |||
| global_x = 1; | |||
| global_y = 1; | |||
| } | |||
| global_size_ = {global_x, global_y, global_z}; | |||
| return lite::RET_OK; | |||
| } | |||
| int SoftmaxOpenCLKernel::SetWorkGroupSize() { | |||
| // set work group size | |||
| InitGlobalSize(); | |||
| int max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); | |||
| local_size_ = GetCommonLocalSize(global_size_, max_work_group_size); | |||
| global_size_ = GetCommonGlobalSize(local_size_, global_size_); | |||
| return lite::RET_OK; | |||
| } | |||
| int SoftmaxOpenCLKernel::SetWorkGroupSize1x1() { | |||
| local_size_ = {32, 1, 1}; | |||
| global_size_ = {32, 1, 1}; | |||
| return lite::RET_OK; | |||
| } | |||
| int SoftmaxOpenCLKernel::Init() { | |||
| std::string kernel_name = "SoftMax"; | |||
| std::string program_name = "SoftMax"; | |||
| auto softmax_param = reinterpret_cast<SoftmaxParameter *>(op_parameter_); | |||
| axis_ = softmax_param->axis_; | |||
| int SoftmaxOpenCLKernel::CheckSpecs() { | |||
| axis_ = parameter_->axis_; | |||
| auto in_shape = in_tensors_[0]->shape(); | |||
| if (in_shape.size() > 4) { | |||
| MS_LOG(ERROR) << "Init `Softmax` kernel failed: Unsupported shape size: " << in_shape.size(); | |||
| return RET_ERROR; | |||
| } | |||
| if (in_shape[0] > 1) { | |||
| MS_LOG(ERROR) << "Init `Softmax` kernel failed: Unsupported multi-batch."; | |||
| return RET_ERROR; | |||
| } | |||
| if (axis_ < 0) { | |||
| axis_ = in_shape.size() + axis_; | |||
| } | |||
| @@ -95,11 +61,15 @@ int SoftmaxOpenCLKernel::Init() { | |||
| MS_LOG(ERROR) << "Init `Softmax` kernel failed: softmax axis should be H W or C"; | |||
| return RET_ERROR; | |||
| } | |||
| nhwc_shape_ = GetNHWCShape(in_shape); | |||
| return RET_OK; | |||
| } | |||
| int SoftmaxOpenCLKernel::Prepare() { | |||
| std::string kernel_name = "SoftMax"; | |||
| out_shape = Image2DInfo(out_tensors_[0]); | |||
| std::string source = softmax_source; | |||
| enable_fp16_ = ocl_runtime_->GetFp16Enable(); | |||
| // the framework does not set this param yet, so use the default. | |||
| if (nhwc_shape_[1] == 1 && nhwc_shape_[2] == 1 && axis_ == 3) { | |||
| if (out_shape.H == 1 && out_shape.W == 1 && axis_ == 3) { | |||
| // support 4d tensor | |||
| onexone_flag_ = true; | |||
| kernel_name += "1x1"; | |||
| @@ -112,62 +82,63 @@ int SoftmaxOpenCLKernel::Init() { | |||
| kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| std::set<std::string> build_options; | |||
| std::string program_name = "SoftMax"; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return lite::RET_OK; | |||
| } | |||
| int SoftmaxOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| void SoftmaxOpenCLKernel::SetGlobalLocal() { | |||
| if (onexone_flag_) { | |||
| local_size_ = {32}; | |||
| global_size_ = {32}; | |||
| } else { | |||
| size_t global_x, global_y; | |||
| if (axis_ == 1) { | |||
| global_x = out_shape.Slice; | |||
| global_y = out_shape.W; | |||
| } else if (axis_ == 2) { | |||
| global_x = out_shape.Slice; | |||
| global_y = out_shape.H; | |||
| } else if (axis_ == 3) { | |||
| global_x = out_shape.W; | |||
| global_y = out_shape.H; | |||
| } else { | |||
| global_x = 1; | |||
| global_y = 1; | |||
| } | |||
| global_size_ = {global_x, global_y}; | |||
| local_size_ = {}; | |||
| } | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| int channel = nhwc_shape_[3]; | |||
| int c4 = UP_DIV(channel, C4NUM); | |||
| void SoftmaxOpenCLKernel::SetConstArgs() { | |||
| int arg_idx = 2; | |||
| int channel = out_shape.C; | |||
| int c4 = out_shape.Slice; | |||
| auto mask_ = GetMaskForLastChannel(channel); | |||
| cl_float4 mask = {mask_[0], mask_[1], mask_[2], mask_[3]}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, mask); | |||
| cl_int4 input_shape = {nhwc_shape_[0], nhwc_shape_[1], nhwc_shape_[2], c4}; | |||
| cl_int4 input_shape = {static_cast<int>(out_shape.N), static_cast<int>(out_shape.H), static_cast<int>(out_shape.W), | |||
| c4}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx, input_shape); | |||
| if (onexone_flag_) { | |||
| SetWorkGroupSize1x1(); | |||
| } else { | |||
| SetWorkGroupSize(); | |||
| } | |||
| } | |||
| int SoftmaxOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| // run opencl kernel | |||
| ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return lite::RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLSoftMaxKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) SoftmaxOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | |||
| free(opParameter); | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| if (inputs[0]->shape()[0] > 1) { | |||
| MS_LOG(ERROR) << "Init `Softmax` kernel failed: Unsupported multi-batch."; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| MS_LOG(ERROR) << "Init `Softmax` kernel failed!"; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SoftMax, OpenCLSoftMaxKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SoftMax, OpenCLSoftMaxKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SoftMax, OpenCLKernelCreator<SoftmaxOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SoftMax, OpenCLKernelCreator<SoftmaxOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -33,8 +33,11 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { | |||
| } | |||
| ~SoftmaxOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| int InitGlobalSize(); | |||
| @@ -47,9 +50,8 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { | |||
| bool onexone_flag_{false}; | |||
| std::vector<size_t> local_size_; | |||
| std::vector<size_t> global_size_; | |||
| bool enable_fp16_{false}; | |||
| int axis_{0}; | |||
| std::vector<int> nhwc_shape_; | |||
| Image2DInfo out_shape = Image2DInfo(nullptr); | |||
| }; | |||
| } // namespace mindspore::kernel | |||
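SoftmaxOpenCLKernel::SetGlobalLocal() above replaces the old InitGlobalSize/SetWorkGroupSize pair: the 1x1 fast path keeps a fixed 32-wide range, and otherwise the two global dimensions are picked from the output Image2DInfo according to the softmax axis. A standalone restatement follows; the Image2DInfo here is a stripped-down stand-in carrying only the fields the computation needs.

```cpp
// Sketch of the axis-dependent global range used by the softmax kernel.
#include <cstdio>
#include <vector>

struct Image2DInfo { size_t N, H, W, C, Slice; };  // stand-in; the real one derives Slice = UP_DIV(C, 4)

std::vector<size_t> SoftmaxGlobal(const Image2DInfo &out, int axis, bool onexone) {
  if (onexone) return {32};                            // 1x1 fast path: fixed 32-wide range
  size_t gx = 1, gy = 1;
  if (axis == 1) { gx = out.Slice; gy = out.W; }       // softmax over H
  else if (axis == 2) { gx = out.Slice; gy = out.H; }  // softmax over W
  else if (axis == 3) { gx = out.W; gy = out.H; }      // softmax over C
  return {gx, gy};
}

int main() {
  Image2DInfo out{1, 4, 5, 9, 3};                      // hypothetical 1x4x5x9 tensor, Slice = 3
  auto g = SoftmaxGlobal(out, 3, false);
  std::printf("global = {%zu, %zu}\n", g[0], g[1]);    // {5, 4}
  return 0;
}
```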
| @@ -33,9 +33,7 @@ using mindspore::schema::PrimitiveType_Transpose; | |||
| namespace mindspore::kernel { | |||
| int TransposeOpenCLKernel::Init() { | |||
| std::string kernel_name = "transpose"; | |||
| enable_fp16_ = ocl_runtime_->GetFp16Enable(); | |||
| int TransposeOpenCLKernel::CheckSpecs() { | |||
| auto param = reinterpret_cast<TransposeParameter *>(op_parameter_); | |||
| if (in_tensors_[0]->shape().size() != 4 || in_tensors_[0]->shape()[0] > 1) { | |||
| MS_LOG(ERROR) << "Transpose only support 4d tensor and n = 1 yet."; | |||
| @@ -43,16 +41,24 @@ int TransposeOpenCLKernel::Init() { | |||
| } | |||
| if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 3 && param->perm_[2] == 1 && | |||
| param->perm_[3] == 2) { | |||
| kernel_name += "_0312"; | |||
| type = TransposeType::AXIS0312; | |||
| } else if (param->num_axes_ == 4 && param->perm_[0] == 0 && param->perm_[1] == 2 && param->perm_[2] == 3 && | |||
| param->perm_[3] == 1) { | |||
| kernel_name += "_0231"; | |||
| type = TransposeType::AXIS0231; | |||
| } else { | |||
| MS_LOG(ERROR) << "unsupported transpose axes."; | |||
| return mindspore::lite::RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int TransposeOpenCLKernel::Prepare() { | |||
| std::string kernel_name = "transpose"; | |||
| if (type == TransposeType::AXIS0312) { | |||
| kernel_name += "_0312"; | |||
| } else if (type == TransposeType::AXIS0231) { | |||
| kernel_name += "_0231"; | |||
| } | |||
| if (in_tensors_[0]->shape()[2] * UP_DIV(in_tensors_[0]->shape()[3], C4NUM) > MAX_IMAGE2D_SIZE) { | |||
| // oversize check is based on the input shape only | |||
| kernel_name += "_oversize"; | |||
| @@ -68,58 +74,49 @@ int TransposeOpenCLKernel::Init() { | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| int TransposeOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| void TransposeOpenCLKernel::SetConstArgs() { | |||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||
| size_t n = shapex[0]; // n=1 | |||
| size_t h = shapex[1]; | |||
| size_t w = shapex[2]; | |||
| size_t c = shapex[3]; | |||
| int arg_idx = 2; | |||
| cl_int4 shape = {static_cast<int>(n), static_cast<int>(h), static_cast<int>(w), static_cast<int>(c)}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape); | |||
| } | |||
| void TransposeOpenCLKernel::SetGlobalLocal() { | |||
| std::vector<int> shapex = out_tensors_[0]->shape(); | |||
| size_t h = shapex[1]; | |||
| size_t w = shapex[2]; | |||
| size_t c = shapex[3]; | |||
| size_t c4 = UP_DIV(c, 4); | |||
| std::vector<size_t> local = {}; | |||
| std::vector<size_t> global; | |||
| if (type == TransposeType::AXIS0312) { // NHWC -> NCHW | |||
| global = {UP_DIV(h, C4NUM), w, c4}; | |||
| global_range_ = {UP_DIV(h, C4NUM), w, c4}; | |||
| } else if (type == TransposeType::AXIS0231) { // NCHW -> NHWC | |||
| global = {h, UP_DIV(w, C4NUM), c4}; | |||
| global_range_ = {h, UP_DIV(w, C4NUM), c4}; | |||
| } | |||
| cl_int4 shape = {static_cast<int>(n), static_cast<int>(h), static_cast<int>(w), static_cast<int>(c)}; | |||
| } | |||
| int TransposeOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, shape); | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = | |||
| new (std::nothrow) TransposeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Transpose, OpenCLTransposeKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Transpose, OpenCLTransposeKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Nhwc2Nchw, OpenCLTransposeKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Nhwc2Nchw, OpenCLTransposeKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Nchw2Nhwc, OpenCLTransposeKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Nchw2Nhwc, OpenCLTransposeKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Transpose, OpenCLKernelCreator<TransposeOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Transpose, OpenCLKernelCreator<TransposeOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Nhwc2Nchw, OpenCLKernelCreator<TransposeOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Nhwc2Nchw, OpenCLKernelCreator<TransposeOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Nchw2Nhwc, OpenCLKernelCreator<TransposeOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Nchw2Nhwc, OpenCLKernelCreator<TransposeOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -34,12 +34,14 @@ class TransposeOpenCLKernel : public OpenCLKernel { | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~TransposeOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| bool enable_fp16_{false}; | |||
| TransposeType type{TransposeType::AXIS0312}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
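CheckSpecs() above now classifies the permutation into the TransposeType enum and Prepare() derives the kernel-name suffix from it, instead of building the name during validation. The classification is restated below; the UNSUPPORTED value exists only in this sketch (the real enum has just the two axis types, and CheckSpecs() returns an error instead).

```cpp
// Sketch of the perm -> TransposeType classification done in CheckSpecs(); only the
// two permutations accepted in the diff are supported.
#include <array>
#include <cstdio>

enum class TransposeType { AXIS0312, AXIS0231, UNSUPPORTED };

TransposeType ClassifyPerm(const std::array<int, 4> &perm) {
  if (perm == std::array<int, 4>{0, 3, 1, 2}) return TransposeType::AXIS0312;  // NHWC -> NCHW
  if (perm == std::array<int, 4>{0, 2, 3, 1}) return TransposeType::AXIS0231;  // NCHW -> NHWC
  return TransposeType::UNSUPPORTED;  // the real CheckSpecs() rejects this case with an error
}

int main() {
  bool ok = ClassifyPerm({0, 3, 1, 2}) == TransposeType::AXIS0312;
  std::printf("perm (0,3,1,2) classified as AXIS0312: %s\n", ok ? "yes" : "no");
  return 0;
}
```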
| @@ -211,15 +211,15 @@ int SubGraphOpenCLKernel::Init() { | |||
| } | |||
| nodes_.insert(nodes_.end(), out_convert_ops_.begin(), out_convert_ops_.end()); | |||
| UpdateTensorDataType(); | |||
| MallocTensorWithReuse(); | |||
| ret = SubGraphKernel::Prepare(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "OpenCL prepare fail"; | |||
| return ret; | |||
| } | |||
| UpdateTensorDataType(); | |||
| MallocTensorWithReuse(); | |||
| return RET_OK; | |||
| } | |||
| @@ -14,772 +14,109 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "src/common/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||
| #include "mindspore/lite/src/runtime/opencl/opencl_allocator.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||
| #include "mindspore/lite/nnacl/fp32/activation.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h" | |||
| using mindspore::kernel::LiteKernel; | |||
| using mindspore::kernel::SubGraphOpenCLKernel; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::lite::Tensor; | |||
| using mindspore::schema::ActivationType_HSWISH; | |||
| using mindspore::schema::ActivationType_LEAKY_RELU; | |||
| using mindspore::schema::ActivationType_RELU; | |||
| using mindspore::schema::ActivationType_RELU6; | |||
| using mindspore::schema::ActivationType_SIGMOID; | |||
| using mindspore::schema::ActivationType_SWISH; | |||
| using mindspore::schema::ActivationType_TANH; | |||
| using mindspore::schema::PrimitiveType_Activation; | |||
| #include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" | |||
| namespace mindspore { | |||
| class TestActivationOpenCL : public mindspore::CommonTest {}; | |||
| class TestActivationOpenCLTanh : public mindspore::CommonTest {}; | |||
| void LoadActivationData(void *dst, size_t dst_size, const std::string &file_path) { | |||
| if (file_path.empty()) { | |||
| memset(dst, 0x00, dst_size); | |||
| } else { | |||
| auto src_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(file_path.c_str(), &dst_size)); | |||
| memcpy(dst, src_data, dst_size); | |||
| } | |||
| } | |||
| template <typename T> | |||
| void CompareRes(lite::Tensor *output_tensor, const std::string &standard_answer_file) { | |||
| auto *output_data = reinterpret_cast<T *>(output_tensor->data_c()); | |||
| size_t output_size = output_tensor->Size(); | |||
| auto expect_data = reinterpret_cast<T *>(mindspore::lite::ReadFile(standard_answer_file.c_str(), &output_size)); | |||
| constexpr float atol = 0.001; | |||
| for (int i = 0; i < output_tensor->ElementsNum(); ++i) { | |||
| if (std::fabs(output_data[i] - expect_data[i]) > atol) { | |||
| printf("error at idx[%d] expect=%f output=%f\n", i, expect_data[i], output_data[i]); | |||
| printf("error at idx[%d] expect=%f output=%f\n", i, expect_data[i], output_data[i]); | |||
| printf("error at idx[%d] expect=%f output=%f\n\n\n", i, expect_data[i], output_data[i]); | |||
| return; | |||
| } | |||
| } | |||
| printf("compare success!\n"); | |||
| printf("compare success!\n"); | |||
| printf("compare success!\n\n\n"); | |||
| } | |||
| template <typename T> | |||
| void printf_tensor(const std::string &str, mindspore::lite::Tensor *in_data) { | |||
| MS_LOG(INFO) << str; | |||
| auto input_data = reinterpret_cast<T *>(in_data->data_c()); | |||
| for (int i = 0; i < in_data->ElementsNum(); ++i) { | |||
| printf("%f ", input_data[i]); | |||
| } | |||
| printf("\n"); | |||
| MS_LOG(INFO) << "Print tensor done"; | |||
| } | |||
| TEST_F(TestActivationOpenCL, ReluFp_dim4) { | |||
| std::string in_file = "/data/local/tmp/in_data.bin"; | |||
| std::string out_file = "/data/local/tmp/relu.bin"; | |||
| MS_LOG(INFO) << "Relu Begin test!"; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| ocl_runtime->Init(); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| auto data_type = kNumberTypeFloat16; | |||
| ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); | |||
| bool enable_fp16 = ocl_runtime->GetFp16Enable(); | |||
| MS_LOG(INFO) << "Init tensors."; | |||
| std::vector<int> input_shape = {1, 9}; | |||
| schema::Format format = schema::Format_NC; | |||
| auto tensor_type = lite::Tensor::CONST_TENSOR; | |||
| auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); | |||
| if (input_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new input tensor error!"; | |||
| return; | |||
| } | |||
| auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new output tensor error!"; | |||
| delete input_tensor; | |||
| return; | |||
| } | |||
| std::vector<lite::Tensor *> inputs{input_tensor}; | |||
| std::vector<lite::Tensor *> outputs{output_tensor}; | |||
| inputs[0]->MallocData(allocator); | |||
| LoadActivationData(inputs[0]->data_c(), inputs[0]->Size(), in_file); | |||
| if (enable_fp16) { | |||
| printf_tensor<float16_t>("ReluFp16:--input data---", inputs[0]); | |||
| } else { | |||
| printf_tensor<float>("ReluFp32:--input data---", inputs[0]); | |||
| } | |||
| auto *param = new (std::nothrow) ActivationParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "New ActivationParameter fail."; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| return; | |||
| } | |||
| param->type_ = ActivationType_RELU; | |||
| auto *kernel = | |||
| new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "Kernel:Relu create fail."; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| return; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete kernel; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| MS_LOG(ERROR) << "Init relu fail."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; | |||
| std::vector<kernel::LiteKernel *> kernels{kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| delete kernel; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Initialize sub_graph."; | |||
| ret = sub_graph->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init sub_graph error."; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; | |||
| ret = sub_graph->Run(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; | |||
| return; | |||
| } | |||
| if (enable_fp16) { | |||
| printf_tensor<float16_t>("ReluFp16--output data---", outputs[0]); | |||
| CompareRes<float16_t>(output_tensor, out_file); | |||
| } else { | |||
| printf_tensor<float>("ReluFp32--output data--", outputs[0]); | |||
| CompareRes<float>(output_tensor, out_file); | |||
| } | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| } | |||
| TEST_F(TestActivationOpenCL, Relu6Fp_dim4) { | |||
| std::string in_file = "/data/local/tmp/in_data.bin"; | |||
| std::string out_file = "/data/local/tmp/relu6.bin"; | |||
| MS_LOG(INFO) << "Relu6 Begin test!"; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| auto data_type = kNumberTypeFloat16; | |||
| ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); | |||
| bool enable_fp16 = ocl_runtime->GetFp16Enable(); | |||
| ocl_runtime->Init(); | |||
| MS_LOG(INFO) << "Init tensors."; | |||
| std::vector<int> input_shape = {1, 9}; | |||
| schema::Format format = schema::Format_NC; | |||
| auto tensor_type = lite::Tensor::CONST_TENSOR; | |||
| auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); | |||
| if (input_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new input tensor error!"; | |||
| return; | |||
| } | |||
| auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new output tensor error!"; | |||
| delete input_tensor; | |||
| return; | |||
| } | |||
| std::vector<lite::Tensor *> inputs{input_tensor}; | |||
| std::vector<lite::Tensor *> outputs{output_tensor}; | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| inputs[0]->MallocData(allocator); | |||
| MS_LOG(INFO) << "Initialize input data"; | |||
| LoadActivationData(inputs[0]->data_c(), inputs[0]->Size(), in_file); | |||
| if (enable_fp16) { | |||
| printf_tensor<float16_t>("Relu6:FP16--input data--", inputs[0]); | |||
| } else { | |||
| printf_tensor<float>("Relu6:FP32--input data--", inputs[0]); | |||
| } | |||
| auto *param = new (std::nothrow) ActivationParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "New ActivationParameter fail."; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| return; | |||
| } | |||
| param->type_ = ActivationType_RELU6; | |||
| auto *kernel = | |||
| new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "Kernel:Relu6 create fail."; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| return; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete kernel; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| MS_LOG(ERROR) << "Init relu6 fail."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; | |||
| std::vector<kernel::LiteKernel *> kernels{kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| delete kernel; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Initialize sub_graph."; | |||
| ret = sub_graph->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init sub_graph error."; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; | |||
| ret = sub_graph->Run(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; | |||
| return; | |||
| } | |||
| class TestActivationOpenCL : public mindspore::CommonTest { | |||
| public: | |||
| TestActivationOpenCL() {} | |||
| }; | |||
| if (enable_fp16) { | |||
| printf_tensor<float16_t>("Relu6:FP16--output data---", outputs[0]); | |||
| CompareRes<float16_t>(output_tensor, out_file); | |||
| } else { | |||
| printf_tensor<float>("Relu6:FP32--output data---", outputs[0]); | |||
| CompareRes<float>(output_tensor, out_file); | |||
| } | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| } | |||
| TEST_F(TestActivationOpenCL, SigmoidFp_dim4) { | |||
| std::string in_file = "/data/local/tmp/in_data.bin"; | |||
| std::string out_file = "/data/local/tmp/sigmoid.bin"; | |||
| MS_LOG(INFO) << "Sigmoid Begin test!"; | |||
| void RunTestCaseActivation(void *input_data0, const std::vector<int> &input_shape, void *output_data, | |||
| const std::vector<int> &out_shape, bool enable_fp16, int act_type) { | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| ocl_runtime->Init(); | |||
| auto data_type = kNumberTypeFloat32; | |||
| ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); | |||
| bool enable_fp16 = ocl_runtime->GetFp16Enable(); | |||
| MS_LOG(INFO) << "Init tensors."; | |||
| std::vector<int> input_shape = {1, 9}; | |||
| schema::Format format = schema::Format_NC; | |||
| auto tensor_type = lite::Tensor::CONST_TENSOR; | |||
| auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); | |||
| if (input_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new input tensor error!"; | |||
| return; | |||
| } | |||
| auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new output tensor error!"; | |||
| delete input_tensor; | |||
| return; | |||
| } | |||
| std::vector<lite::Tensor *> inputs{input_tensor}; | |||
| std::vector<lite::Tensor *> outputs{output_tensor}; | |||
| size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); | |||
| ocl_runtime->SetFp16Enable(enable_fp16); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| inputs[0]->MallocData(allocator); | |||
| MS_LOG(INFO) << "Initialize input data"; | |||
| LoadActivationData(inputs[0]->data_c(), inputs[0]->Size(), in_file); | |||
| if (enable_fp16) { | |||
| printf_tensor<float16_t>("Sigmoid:FP16--input data--", inputs[0]); | |||
| } else { | |||
| printf_tensor<float>("Sigmoid:FP32--input data--", inputs[0]); | |||
| } | |||
| auto *param = new (std::nothrow) ActivationParameter(); | |||
| auto param = static_cast<ActivationParameter *>(malloc(sizeof(ActivationParameter))); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "New ActivationParameter fail."; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| return; | |||
| } | |||
| param->type_ = ActivationType_SIGMOID; | |||
| auto *kernel = | |||
| new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "Kernel:Sigmoid create fail."; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| return; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete kernel; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| MS_LOG(ERROR) << "Init sigmoid fail."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; | |||
| std::vector<kernel::LiteKernel *> kernels{kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| delete kernel; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Initialize sub_graph."; | |||
| ret = sub_graph->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init sub_graph error."; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| MS_LOG(ERROR) << "param_ptr create error."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; | |||
| ret = sub_graph->Run(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; | |||
| param->op_parameter_.type_ = schema::PrimitiveType_Activation; | |||
| param->type_ = act_type; | |||
| auto tensor_x_ptr = | |||
| std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), input_shape); | |||
| auto tensor_x = tensor_x_ptr.get(); | |||
| if (tensor_x == nullptr) { | |||
| MS_LOG(ERROR) << "tensor_x create error."; | |||
| return; | |||
| } | |||
| if (enable_fp16) { | |||
| printf_tensor<float16_t>("Sigmoid:FP16--output data---", outputs[0]); | |||
| CompareRes<float16_t>(output_tensor, out_file); | |||
| } else { | |||
| printf_tensor<float>("Sigmoid:FP32--output data---", outputs[0]); | |||
| CompareRes<float>(output_tensor, out_file); | |||
| } | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| } | |||
| TEST_F(TestActivationOpenCL, LeakyReluFp_dim4) { | |||
| std::string in_file = "/data/local/tmp/in_data.bin"; | |||
| std::string out_file = "/data/local/tmp/leaky_relu.bin"; | |||
| MS_LOG(INFO) << "Leaky relu Begin test!"; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| ocl_runtime->Init(); | |||
| auto data_type = kNumberTypeFloat16; | |||
| ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); | |||
| bool enable_fp16 = ocl_runtime->GetFp16Enable(); | |||
| MS_LOG(INFO) << "Init tensors."; | |||
| std::vector<int> input_shape = {1, 9}; | |||
| auto tensor_type = lite::Tensor::CONST_TENSOR; | |||
| schema::Format format = schema::Format_NC; | |||
| auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); | |||
| if (input_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new input tensor error!"; | |||
| auto tensor_out_ptr = | |||
| std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), out_shape); | |||
| auto tensor_out = tensor_out_ptr.get(); | |||
| if (tensor_out == nullptr) { | |||
| MS_LOG(ERROR) << "tensor_out create error."; | |||
| return; | |||
| } | |||
| auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new output tensor error!"; | |||
| delete input_tensor; | |||
| std::vector<lite::Tensor *> inputs{tensor_x}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto op_kernel = kernel::OpenCLKernelCreator<kernel::ActivationOpenCLKernel>( | |||
| inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr); | |||
| if (op_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "op_kernel create error."; | |||
| return; | |||
| } | |||
| std::vector<lite::Tensor *> inputs{input_tensor}; | |||
| std::vector<lite::Tensor *> outputs{output_tensor}; | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| inputs[0]->MallocData(allocator); | |||
| MS_LOG(INFO) << "Initialize input data"; | |||
| LoadActivationData(inputs[0]->data_c(), inputs[0]->Size(), in_file); | |||
| if (enable_fp16) { | |||
| printf_tensor<float16_t>("Leaky Relu:FP16--input data--", inputs[0]); | |||
| } else { | |||
| printf_tensor<float>("Leaky Relu:FP32--input data--", inputs[0]); | |||
| } | |||
| auto *param = new (std::nothrow) ActivationParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "New ActivationParameter fail."; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| return; | |||
| } | |||
| param->alpha_ = 0.3f; | |||
| param->type_ = ActivationType_LEAKY_RELU; | |||
| auto *kernel = | |||
| new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "Kernel:leaky relu create fail."; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| return; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete kernel; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| MS_LOG(ERROR) << "Init leaky relu fail."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; | |||
| std::vector<kernel::LiteKernel *> kernels{kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| delete kernel; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Initialize sub_graph."; | |||
| ret = sub_graph->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init sub_graph error."; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; | |||
| ret = sub_graph->Run(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; | |||
| return; | |||
| } | |||
| if (enable_fp16) { | |||
| printf_tensor<float16_t>("Leaky Relu:FP16--output data---", outputs[0]); | |||
| CompareRes<float16_t>(output_tensor, out_file); | |||
| } else { | |||
| printf_tensor<float>("Leaky Relu:FP32--output data---", outputs[0]); | |||
| CompareRes<float>(output_tensor, out_file); | |||
| } | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| } | |||
| TEST_F(TestActivationOpenCLTanh, TanhFp_dim4) { | |||
| std::string in_file = "/data/local/tmp/test_data/in_tanhfp16.bin"; | |||
| std::string out_file = "/data/local/tmp/test_data/out_tanhfp16.bin"; | |||
| MS_LOG(INFO) << "Tanh Begin test!"; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| ocl_runtime->Init(); | |||
| auto data_type = kNumberTypeFloat16; | |||
| ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); | |||
| bool enable_fp16 = ocl_runtime->GetFp16Enable(); | |||
| std::vector<kernel::LiteKernel *> kernels{op_kernel}; | |||
| MS_LOG(INFO) << "Init tensors."; | |||
| std::vector<int> input_shape = {1, 2, 3, 9}; | |||
| schema::Format format = schema::Format_NHWC; | |||
| auto tensor_type = lite::Tensor::CONST_TENSOR; | |||
| auto *input_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); | |||
| if (input_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new input tensor error!"; | |||
| return; | |||
| } | |||
| auto *output_tensor = new (std::nothrow) lite::Tensor(data_type, input_shape, format, tensor_type); | |||
| if (output_tensor == nullptr) { | |||
| MS_LOG(ERROR) << "new output tensor error!"; | |||
| delete input_tensor; | |||
| std::vector<lite::Tensor *> inputs_g{tensor_x}; | |||
| auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs_g, outputs, kernels, kernels, kernels); | |||
| auto pGraph = pGraph_ptr.get(); | |||
| if (pGraph == nullptr) { | |||
| MS_LOG(ERROR) << "pGraph create error."; | |||
| return; | |||
| } | |||
| std::vector<lite::Tensor *> inputs{input_tensor}; | |||
| std::vector<lite::Tensor *> outputs{output_tensor}; | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| inputs[0]->MallocData(allocator); | |||
| MS_LOG(INFO) << "Initialize input data"; | |||
| LoadActivationData(inputs[0]->data_c(), inputs[0]->Size(), in_file); | |||
| pGraph->Init(); | |||
| memcpy(inputs[0]->MutableData(), input_data0, tensor_x->ElementsNum() * dtype_size); | |||
| pGraph->Run(); | |||
| if (enable_fp16) { | |||
| printf_tensor<float16_t>("Tanh:FP16--input data--", inputs[0]); | |||
| CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float16_t>(1e-3), | |||
| 2e-2); | |||
| } else { | |||
| printf_tensor<float>("Tanh:FP32--input data--", inputs[0]); | |||
| } | |||
| auto param = reinterpret_cast<ActivationParameter *>(malloc(sizeof(ActivationParameter))); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "New ActivationParameter fail."; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| return; | |||
| } | |||
| param->type_ = ActivationType_TANH; | |||
| auto *kernel = | |||
| new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "Kernel:Tanh create fail."; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| return; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete kernel; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| MS_LOG(ERROR) << "Init tanh fail."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; | |||
| std::vector<kernel::LiteKernel *> kernels{kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| delete kernel; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; | |||
| return; | |||
| CompareOutput(outputs[0]->MutableData(), output_data, tensor_out->ElementsNum(), static_cast<float>(1e-5)); | |||
| } | |||
| MS_LOG(INFO) << "Initialize sub_graph."; | |||
| ret = sub_graph->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init sub_graph error."; | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; | |||
| ret = sub_graph->Run(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete input_tensor; | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; | |||
| return; | |||
| for (auto t : inputs) { | |||
| t->set_data(nullptr); | |||
| } | |||
| if (enable_fp16) { | |||
| printf_tensor<float16_t>("Tanh:FP16--output data---", outputs[0]); | |||
| CompareRes<float16_t>(output_tensor, out_file); | |||
| } else { | |||
| printf_tensor<float>("Tanh:FP32--output data---", outputs[0]); | |||
| CompareRes<float>(output_tensor, out_file); | |||
| for (auto t : outputs) { | |||
| t->set_data(nullptr); | |||
| } | |||
| input_tensor->set_data(nullptr); | |||
| delete input_tensor; | |||
| output_tensor->set_data(nullptr); | |||
| delete output_tensor; | |||
| delete sub_graph; | |||
| MS_LOG(INFO) << "TestActivation passed"; | |||
| } | |||
| TEST_F(TestActivationOpenCL, SwishFp16_dim4) { | |||
| size_t input_size; | |||
| std::string in_file = "/data/local/tmp/test_data/in_swishfp16.bin"; | |||
| std::string out_file = "/data/local/tmp/test_data/out_swishfp16.bin"; | |||
| auto input_data = reinterpret_cast<float16_t *>(mindspore::lite::ReadFile(in_file.c_str(), &input_size)); | |||
| MS_LOG(INFO) << "Swish Begin test!"; | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper(); | |||
| auto runtime = ocl_runtime.GetInstance(); | |||
| runtime->Init(); | |||
| auto data_type = kNumberTypeFloat16; | |||
| runtime->SetFp16Enable(data_type == kNumberTypeFloat16); | |||
| bool enable_fp16 = runtime->GetFp16Enable(); | |||
| MS_LOG(INFO) << "Init tensors."; | |||
| std::vector<int> input_shape = {1, 2, 3, 9}; | |||
| schema::Format format = schema::Format_NHWC; | |||
| auto tensor_type = lite::Tensor::CONST_TENSOR; | |||
| auto input_tensor = Tensor(data_type, input_shape, format, tensor_type); | |||
| auto output_tensor = Tensor(data_type, input_shape, format, tensor_type); | |||
| std::vector<lite::Tensor *> inputs{&input_tensor}; | |||
| std::vector<lite::Tensor *> outputs{&output_tensor}; | |||
| auto allocator = runtime->GetAllocator(); | |||
| inputs[0]->MallocData(allocator); | |||
| MS_LOG(INFO) << "Initialize input data"; | |||
| memcpy(inputs[0]->data_c(), input_data, input_size); | |||
| if (enable_fp16) { | |||
| printf_tensor<float16_t>("Swish:FP16--input data--", inputs[0]); | |||
| } else { | |||
| printf_tensor<float>("Swish:FP32--input data--", inputs[0]); | |||
| } | |||
| auto param = reinterpret_cast<ActivationParameter *>(malloc(sizeof(ActivationParameter))); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "New ActivationParameter fail."; | |||
| return; | |||
| } | |||
| param->type_ = ActivationType_SWISH; | |||
| auto *kernel = | |||
| new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "Kernel:Swish create fail."; | |||
| delete param; | |||
| return; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete kernel; | |||
| MS_LOG(ERROR) << "Init Swish fail."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; | |||
| std::vector<kernel::LiteKernel *> kernels{kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| delete kernel; | |||
| delete param; | |||
| MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Initialize sub_graph."; | |||
| ret = sub_graph->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init sub_graph error."; | |||
| delete sub_graph; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; | |||
| ret = sub_graph->Run(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete sub_graph; | |||
| MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; | |||
| return; | |||
| } | |||
| CompareRes<float16_t>(&output_tensor, out_file); | |||
| delete sub_graph; | |||
| } | |||
| TEST_F(TestActivationOpenCL, ActivationReLUFp32) { | |||
| int n = 1; | |||
| int h = 2; | |||
| int w = 2; | |||
| int c = 3; | |||
| std::vector<int> in_shape0 = {n, h, w, c}; | |||
| std::vector<int> out_shape = {n, h, w, c}; | |||
| std::vector<float> input_data = {-1.0f, 1.0f, 2.0f, 3.0f, -1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, 9.0f}; | |||
| std::vector<float> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 3.0f, 0.0f, 5.0f, 0.0f, 7.0f, 9.0f}; | |||
| RunTestCaseActivation(input_data.data(), in_shape0, output_data.data(), out_shape, false, | |||
| schema::ActivationType_RELU); | |||
| } | |||
| TEST_F(TestActivationOpenCL, HSwishFp16_dim4) { | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); | |||
| auto runtime = runtime_wrapper.GetInstance(); | |||
| runtime->Init(); | |||
| auto allocator = runtime->GetAllocator(); | |||
| std::vector<int> input_shape = {1, 1, 2, 4}; | |||
| std::vector<int> output_shape = {1, 1, 2, 4}; | |||
| auto data_type = kNumberTypeFloat32; | |||
| auto tensor_type = lite::Tensor::CONST_TENSOR; | |||
| schema::Format format = schema::Format_NHWC; | |||
| float input_data[] = {-3.0, -2.0, -1.0, 0.0, 1.0, 5.0, 6.0, 7.0}; | |||
| float correctOutput[] = {-0, -0.33333334, -0.33333334, 0, 0.6666667, 5, 6, 7}; | |||
| MS_LOG(INFO) << "Init tensors."; | |||
| auto output_tensor = Tensor(data_type, input_shape, format, tensor_type); | |||
| auto in_tensor = Tensor(data_type, output_shape, format, tensor_type); | |||
| std::vector<lite::Tensor *> inputs{&in_tensor}; | |||
| std::vector<lite::Tensor *> outputs{&output_tensor}; | |||
| runtime->SetFp16Enable(data_type == kNumberTypeFloat16); | |||
| MS_LOG(INFO) << "Initialize input data"; | |||
| auto param = reinterpret_cast<ActivationParameter *>(malloc(sizeof(ActivationParameter))); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "New ActivationParameter fail."; | |||
| return; | |||
| } | |||
| param->type_ = ActivationType_HSWISH; | |||
| auto *kernel = | |||
| new (std::nothrow) kernel::ActivationOpenClKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "Kernel:HSwish create fail."; | |||
| delete param; | |||
| return; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete kernel; | |||
| MS_LOG(ERROR) << "Init HSwish fail."; | |||
| return; | |||
| } | |||
| inputs[0]->MallocData(allocator); | |||
| memcpy(inputs[0]->data_c(), input_data, sizeof(input_data)); | |||
| MS_LOG(INFO) << "Create kernel SubGraphOpenCLKernel."; | |||
| std::vector<kernel::LiteKernel *> kernels{kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| delete kernel; | |||
| delete param; | |||
| MS_LOG(ERROR) << "Kernel SubGraphOpenCLKernel create fail."; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Initialize sub_graph."; | |||
| ret = sub_graph->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init sub_graph error."; | |||
| delete sub_graph; | |||
| return; | |||
| } | |||
| MS_LOG(INFO) << "Run SubGraphOpenCLKernel."; | |||
| ret = sub_graph->Run(); | |||
| if (ret != RET_OK) { | |||
| delete param; | |||
| delete sub_graph; | |||
| MS_LOG(ERROR) << "Run SubGraphOpenCLKernel error."; | |||
| return; | |||
| } | |||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c()); | |||
| CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001); | |||
| delete sub_graph; | |||
| } | |||
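A minimal host-side sketch (not part of the diff; the helper name is hypothetical) of the formula the `correctOutput` values in `HSwishFp16_dim4` follow, assuming hard-swish is defined as x * relu6(x + 3) / 6:

#include <algorithm>
#include <vector>

// Hypothetical reference, assuming hswish(x) = x * relu6(x + 3) / 6.
std::vector<float> ReferenceHSwish(const std::vector<float> &in) {
  std::vector<float> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    float relu6 = std::min(std::max(in[i] + 3.0f, 0.0f), 6.0f);  // clamp x + 3 to [0, 6]
    out[i] = in[i] * relu6 / 6.0f;
  }
  return out;
}
// Example: ReferenceHSwish({-3, -2, -1, 0, 1, 5, 6, 7}) yields
// {0, -0.3333, -0.3333, 0, 0.6667, 5, 6, 7}, matching correctOutput above.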
| TEST_F(TestActivationOpenCL, ActivationReLUFp16) { | |||
| int n = 1; | |||
| int h = 2; | |||
| int w = 2; | |||
| int c = 3; | |||
| std::vector<int> in_shape0 = {n, h, w, c}; | |||
| std::vector<int> out_shape = {n, h, w, c}; | |||
| std::vector<float16_t> input_data = {-1.0f, 1.0f, 2.0f, 3.0f, -1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, 9.0f}; | |||
| std::vector<float16_t> output_data = {0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 0.0f, 3.0f, 0.0f, 5.0f, 0.0f, 7.0f, 9.0f}; | |||
| RunTestCaseActivation(input_data.data(), in_shape0, output_data.data(), out_shape, true, schema::ActivationType_RELU); | |||
| } | |||
| } // namespace mindspore | |||
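A minimal host-side sketch (not part of the diff; the helper name is hypothetical) of the elementwise reference that the expected `output_data` in `ActivationReLUFp32` and `ActivationReLUFp16` correspond to:

#include <algorithm>
#include <vector>

// Hypothetical reference: ReLU clamps negative inputs to zero, element by element.
std::vector<float> ReferenceRelu(const std::vector<float> &in) {
  std::vector<float> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = std::max(in[i], 0.0f);
  }
  return out;
}
// Example: {-1, 1, 2, 3, -1, -2, 3, -4, 5, -6, 7, 9} maps to
// {0, 1, 2, 3, 0, 0, 3, 0, 5, 0, 7, 9}, the expected output in the tests above.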
| @@ -75,14 +75,12 @@ void RunTestCaseArithmetic(void *input_data0, const std::vector<int> &input_shap | |||
| } | |||
| std::vector<lite::Tensor *> inputs{tensor_x, tensor_w}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto op_kernel_ptr = | |||
| std::make_unique<kernel::ArithmeticOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| auto op_kernel = op_kernel_ptr.release(); | |||
| auto op_kernel = kernel::OpenCLKernelCreator<kernel::ArithmeticOpenCLKernel>( | |||
| inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr); | |||
| if (op_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "op_kernel create error."; | |||
| return; | |||
| } | |||
| op_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||
| std::vector<kernel::LiteKernel *> kernels{op_kernel}; | |||
| @@ -99,15 +99,13 @@ void RunTestCaseConv2dTranspose(const std::vector<int> &shape, void *input_data, | |||
| opParameter->pad_l_ = pad; | |||
| opParameter->input_channel_ = ci; | |||
| opParameter->output_channel_ = co; | |||
| auto op_kernel_ptr = std::make_unique<kernel::Conv2dTransposeOpenCLKernel>( | |||
| reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| auto op_kernel = op_kernel_ptr.release(); | |||
| auto op_kernel = kernel::OpenCLKernelCreator<kernel::Conv2dTransposeOpenCLKernel>( | |||
| inputs, outputs, reinterpret_cast<OpParameter *>(opParameter), nullptr, kernel::KernelKey(), nullptr); | |||
| if (op_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "op_kernel create error."; | |||
| return; | |||
| } | |||
| op_kernel->set_name("DeConv"); | |||
| op_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||
| std::vector<kernel::LiteKernel *> kernels{op_kernel}; | |||
| @@ -63,6 +63,7 @@ void RunTestCaseFullConnection(const std::vector<int> &shape, void *input_data, | |||
| param->a_transpose_ = false; | |||
| param->b_transpose_ = true; | |||
| param->has_bias_ = true; | |||
| param->act_type_ = ActType_No; | |||
| auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||
| input_shape, dims == 2 ? schema::Format_NC : schema::Format_NHWC); | |||
| auto tensor_x = tensor_x_ptr.get(); | |||
| @@ -98,14 +99,12 @@ void RunTestCaseFullConnection(const std::vector<int> &shape, void *input_data, | |||
| } | |||
| std::vector<lite::Tensor *> inputs{tensor_x, tensor_w, tensor_bias}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto op_kernel_ptr = | |||
| std::make_unique<kernel::FullConnectionOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| auto op_kernel = op_kernel_ptr.release(); | |||
| auto op_kernel = kernel::OpenCLKernelCreator<kernel::FullConnectionOpenCLKernel>( | |||
| inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr); | |||
| if (op_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "op_kernel create error."; | |||
| return; | |||
| } | |||
| op_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||
| std::vector<kernel::LiteKernel *> kernels{op_kernel}; | |||
| @@ -87,14 +87,12 @@ void RunTestCaseMatMul(const std::vector<int> &shape, void *input_data, void *we | |||
| } | |||
| std::vector<lite::Tensor *> inputs{tensor_x, tensor_w}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto op_kernel_ptr = | |||
| std::make_unique<kernel::MatMulOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| auto op_kernel = op_kernel_ptr.release(); | |||
| auto op_kernel = kernel::OpenCLKernelCreator<kernel::MatMulOpenCLKernel>( | |||
| inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr); | |||
| if (op_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "op_kernel create error."; | |||
| return; | |||
| } | |||
| op_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||
| std::vector<kernel::LiteKernel *> kernels{op_kernel}; | |||
| @@ -1,158 +0,0 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "src/common/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" | |||
| #include "mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h" | |||
| #include "mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.h" | |||
| namespace mindspore { | |||
| class TestMaxPoolingOpenCL : public mindspore::CommonTest {}; | |||
| void InitMaxPoolingParam(PoolingParameter *param) { | |||
| param->input_batch_ = 1; | |||
| param->input_h_ = 2; | |||
| param->input_w_ = 2; | |||
| param->input_channel_ = 4; | |||
| param->output_batch_ = 1; | |||
| param->output_h_ = 1; | |||
| param->output_w_ = 1; | |||
| param->output_channel_ = 4; | |||
| param->window_h_ = 2; | |||
| param->window_w_ = 2; | |||
| param->stride_h_ = 2; | |||
| param->stride_w_ = 2; | |||
| param->pad_u_ = 0; | |||
| param->pad_d_ = 0; | |||
| param->pad_l_ = 0; | |||
| param->pad_r_ = 0; | |||
| param->pool_mode_ = PoolMode_MaxPool; | |||
| } | |||
| void RunTestCaseMaxPooling(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) { | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| ocl_runtime->Init(); | |||
| size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); | |||
| ocl_runtime->SetFp16Enable(enable_fp16); | |||
| auto allocator = ocl_runtime->GetAllocator(); | |||
| int n = shape[0]; | |||
| int h = shape[1]; | |||
| int w = shape[2]; | |||
| int c = shape[3]; | |||
| int oh = shape[4]; | |||
| int ow = shape[5]; | |||
| auto param = static_cast<PoolingParameter *>(malloc(sizeof(PoolingParameter))); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "param create error."; | |||
| return; | |||
| } | |||
| InitMaxPoolingParam(param); | |||
| std::vector<int> input_shape = {n, h, w, c}; | |||
| auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||
| input_shape, schema::Format_NHWC); | |||
| auto tensor_x = tensor_x_ptr.get(); | |||
| if (tensor_x == nullptr) { | |||
| MS_LOG(ERROR) << "tensor_x create error."; | |||
| return; | |||
| } | |||
| std::vector<int> out_shape = {n, oh, ow, c}; | |||
| auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||
| out_shape, schema::Format_NHWC); | |||
| auto tensor_out = tensor_out_ptr.get(); | |||
| if (tensor_out == nullptr) { | |||
| MS_LOG(ERROR) << "tensor_out create error."; | |||
| return; | |||
| } | |||
| std::vector<lite::Tensor *> inputs{tensor_x}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto arith_kernel_ptr = | |||
| std::make_unique<kernel::PoolingOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| auto arith_kernel = arith_kernel_ptr.release(); | |||
| if (arith_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "arith_kernel create error."; | |||
| return; | |||
| } | |||
| arith_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||
| std::vector<kernel::LiteKernel *> kernels{arith_kernel}; | |||
| auto pGraph_ptr = std::make_unique<kernel::SubGraphOpenCLKernel>(inputs, outputs, kernels, kernels, kernels); | |||
| auto pGraph = pGraph_ptr.get(); | |||
| if (pGraph == nullptr) { | |||
| MS_LOG(ERROR) << "pGraph create error."; | |||
| return; | |||
| } | |||
| pGraph->Init(); | |||
| memcpy(inputs[0]->MutableData(), input_data, inputs[0]->ElementsNum() * dtype_size); | |||
| pGraph->Run(); | |||
| if (enable_fp16) { | |||
| CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float16_t>(1e-3), | |||
| 2e-2); | |||
| } else { | |||
| CompareOutput(outputs[0]->MutableData(), output_data, outputs[0]->ElementsNum(), static_cast<float>(1e-5)); | |||
| } | |||
| for (auto t : inputs) { | |||
| t->set_data(nullptr); | |||
| } | |||
| for (auto t : outputs) { | |||
| t->set_data(nullptr); | |||
| } | |||
| MS_LOG(INFO) << "Test MaxPool2d passed"; | |||
| } | |||
| TEST_F(TestMaxPoolingOpenCL, MaxPoolingFp32) { | |||
| int n = 1; | |||
| int h = 2; | |||
| int w = 2; | |||
| int c = 4; | |||
| int oh = 1; | |||
| int ow = 1; | |||
| std::vector<int> shape = {n, h, w, c, oh, ow}; | |||
| std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, | |||
| 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; | |||
| std::vector<float> output_data = {12.0f, 13.0f, 14.0f, 15.0f}; | |||
| RunTestCaseMaxPooling(shape, input_data.data(), output_data.data(), false); | |||
| } | |||
| TEST_F(TestMaxPoolingOpenCL, MaxPoolingFp16) { | |||
| int n = 1; | |||
| int h = 2; | |||
| int w = 2; | |||
| int c = 4; | |||
| int oh = 1; | |||
| int ow = 1; | |||
| std::vector<int> shape = {n, h, w, c, oh, ow}; | |||
| std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, | |||
| 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; | |||
| std::vector<float16_t> output_data = {12.0f, 13.0f, 14.0f, 15.0f}; | |||
| RunTestCaseMaxPooling(shape, input_data.data(), output_data.data(), true); | |||
| } | |||
| } // namespace mindspore | |||
| @@ -25,9 +25,9 @@ | |||
| namespace mindspore { | |||
| class TestAvgPoolingOpenCL : public mindspore::CommonTest {}; | |||
| class TestPoolingOpenCL : public mindspore::CommonTest {}; | |||
| void InitAvgPoolingParam(PoolingParameter *param) { | |||
| void InitPoolingParam(PoolingParameter *param) { | |||
| param->input_batch_ = 1; | |||
| param->input_h_ = 2; | |||
| param->input_w_ = 2; | |||
| @@ -48,11 +48,10 @@ void InitAvgPoolingParam(PoolingParameter *param) { | |||
| param->pad_d_ = 0; | |||
| param->pad_l_ = 0; | |||
| param->pad_r_ = 0; | |||
| param->pool_mode_ = PoolMode_AvgPool; | |||
| } | |||
| void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16) { | |||
| void RunTestCasePooling(const std::vector<int> &shape, void *input_data, void *output_data, bool enable_fp16, | |||
| PoolMode pool_mode) { | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance(); | |||
| ocl_runtime->Init(); | |||
| size_t dtype_size = enable_fp16 ? sizeof(float16_t) : sizeof(float); | |||
| @@ -69,7 +68,8 @@ void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void | |||
| MS_LOG(ERROR) << "param create error."; | |||
| return; | |||
| } | |||
| InitAvgPoolingParam(param); | |||
| InitPoolingParam(param); | |||
| param->pool_mode_ = pool_mode; | |||
| std::vector<int> input_shape = {n, h, w, c}; | |||
| auto tensor_x_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32), | |||
| input_shape, schema::Format_NHWC); | |||
| @@ -88,14 +88,12 @@ void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void | |||
| } | |||
| std::vector<lite::Tensor *> inputs{tensor_x}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto arith_kernel_ptr = | |||
| std::make_unique<kernel::PoolingOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| auto arith_kernel = arith_kernel_ptr.release(); | |||
| auto arith_kernel = kernel::OpenCLKernelCreator<kernel::PoolingOpenCLKernel>( | |||
| inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr); | |||
| if (arith_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "arith_kernel create error."; | |||
| return; | |||
| } | |||
| arith_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||
| @@ -127,7 +125,7 @@ void RunTestCaseAvgPooling(const std::vector<int> &shape, void *input_data, void | |||
| MS_LOG(INFO) << "Test AvgPool2d passed"; | |||
| } | |||
| TEST_F(TestAvgPoolingOpenCL, AvgPoolingFp32) { | |||
| TEST_F(TestPoolingOpenCL, AvgPoolingFp32) { | |||
| int n = 1; | |||
| int h = 2; | |||
| int w = 2; | |||
| @@ -139,10 +137,10 @@ TEST_F(TestAvgPoolingOpenCL, AvgPoolingFp32) { | |||
| 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; | |||
| std::vector<float> output_data = {6.0f, 7.0f, 8.0f, 9.0f}; | |||
| RunTestCaseAvgPooling(shape, input_data.data(), output_data.data(), false); | |||
| RunTestCasePooling(shape, input_data.data(), output_data.data(), false, PoolMode_AvgPool); | |||
| } | |||
| TEST_F(TestAvgPoolingOpenCL, AvgPoolingFp16) { | |||
| TEST_F(TestPoolingOpenCL, AvgPoolingFp16) { | |||
| int n = 1; | |||
| int h = 2; | |||
| int w = 2; | |||
| @@ -154,6 +152,36 @@ TEST_F(TestAvgPoolingOpenCL, AvgPoolingFp16) { | |||
| 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; | |||
| std::vector<float16_t> output_data = {6.0f, 7.0f, 8.0f, 9.0f}; | |||
| RunTestCaseAvgPooling(shape, input_data.data(), output_data.data(), true); | |||
| RunTestCasePooling(shape, input_data.data(), output_data.data(), true, PoolMode_AvgPool); | |||
| } | |||
| TEST_F(TestPoolingOpenCL, MaxPoolingFp32) { | |||
| int n = 1; | |||
| int h = 2; | |||
| int w = 2; | |||
| int c = 4; | |||
| int oh = 1; | |||
| int ow = 1; | |||
| std::vector<int> shape = {n, h, w, c, oh, ow}; | |||
| std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, | |||
| 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; | |||
| std::vector<float> output_data = {12.0f, 13.0f, 14.0f, 15.0f}; | |||
| RunTestCasePooling(shape, input_data.data(), output_data.data(), false, PoolMode_MaxPool); | |||
| } | |||
| TEST_F(TestPoolingOpenCL, MaxPoolingFp16) { | |||
| int n = 1; | |||
| int h = 2; | |||
| int w = 2; | |||
| int c = 4; | |||
| int oh = 1; | |||
| int ow = 1; | |||
| std::vector<int> shape = {n, h, w, c, oh, ow}; | |||
| std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, | |||
| 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; | |||
| std::vector<float16_t> output_data = {12.0f, 13.0f, 14.0f, 15.0f}; | |||
| RunTestCasePooling(shape, input_data.data(), output_data.data(), true, PoolMode_MaxPool); | |||
| } | |||
| } // namespace mindspore | |||
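A minimal host-side sketch (not part of the diff; names are hypothetical) of the 2x2 window / stride-2 pooling reference behind the expected values in the AvgPooling and MaxPooling cases above, assuming a 1x2x2xC NHWC input reduced to 1x1x1xC:

#include <algorithm>
#include <limits>
#include <vector>

// Hypothetical reference: pool the four spatial positions of a 1x2x2xC NHWC tensor per channel.
std::vector<float> ReferencePool2x2(const std::vector<float> &nhwc, int c, bool max_pool) {
  std::vector<float> out(c, max_pool ? -std::numeric_limits<float>::infinity() : 0.0f);
  for (int pos = 0; pos < 4; ++pos) {      // four spatial positions covered by the window
    for (int ch = 0; ch < c; ++ch) {
      float v = nhwc[pos * c + ch];
      out[ch] = max_pool ? std::max(out[ch], v) : out[ch] + v;
    }
  }
  if (!max_pool) {
    for (auto &v : out) v /= 4.0f;         // average over the 2x2 window
  }
  return out;
}
// Example: for input {0, 1, ..., 15} with c = 4, AvgPool gives {6, 7, 8, 9} and
// MaxPool gives {12, 13, 14, 15}, matching the expected output_data in the tests above.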
| @@ -75,14 +75,12 @@ void RunTestCaseReduce(const std::vector<int> &shape, void *input_data, void *ou | |||
| } | |||
| std::vector<lite::Tensor *> inputs{tensor_x}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto arith_kernel_ptr = | |||
| std::make_unique<kernel::ReduceOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| auto arith_kernel = arith_kernel_ptr.release(); | |||
| auto arith_kernel = kernel::OpenCLKernelCreator<kernel::ReduceOpenCLKernel>( | |||
| inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr); | |||
| if (arith_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "arith_kernel create error."; | |||
| return; | |||
| } | |||
| arith_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||
| @@ -54,13 +54,12 @@ void RunTestCaseReshape(const std::vector<int> &shape_in, const std::vector<int> | |||
| } | |||
| std::vector<lite::Tensor *> inputs{tensor_x}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto arith_kernel_ptr = std::make_unique<kernel::ReshapeOpenCLKernel>(nullptr, inputs, outputs); | |||
| auto arith_kernel = arith_kernel_ptr.release(); | |||
| auto arith_kernel = kernel::OpenCLKernelCreator<kernel::ReshapeOpenCLKernel>(inputs, outputs, nullptr, nullptr, | |||
| kernel::KernelKey(), nullptr); | |||
| if (arith_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "arith_kernel create error."; | |||
| return; | |||
| } | |||
| arith_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||
| @@ -69,14 +69,12 @@ void RunTestCaseResize(const std::vector<int> &shape, void *input_data, void *ou | |||
| } | |||
| std::vector<lite::Tensor *> inputs{tensor_x}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto arith_kernel_ptr = | |||
| std::make_unique<kernel::ResizeOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| auto arith_kernel = arith_kernel_ptr.release(); | |||
| auto arith_kernel = kernel::OpenCLKernelCreator<kernel::ResizeOpenCLKernel>( | |||
| inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr); | |||
| if (arith_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "arith_kernel create error."; | |||
| return; | |||
| } | |||
| arith_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||
| @@ -75,14 +75,12 @@ void RunTestCaseSoftmax(const std::vector<int> &shape, void *input_data, void *o | |||
| return; | |||
| } | |||
| opParameter->axis_ = axis; | |||
| auto arith_kernel_ptr = | |||
| std::make_unique<kernel::SoftmaxOpenCLKernel>(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| auto arith_kernel = arith_kernel_ptr.release(); | |||
| auto arith_kernel = kernel::OpenCLKernelCreator<kernel::SoftmaxOpenCLKernel>( | |||
| inputs, outputs, reinterpret_cast<OpParameter *>(opParameter), nullptr, kernel::KernelKey(), nullptr); | |||
| if (arith_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "arith_kernel create error."; | |||
| return; | |||
| } | |||
| arith_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||
| @@ -67,14 +67,12 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out | |||
| } | |||
| std::vector<lite::Tensor *> inputs{tensor_x}; | |||
| std::vector<lite::Tensor *> outputs{tensor_out}; | |||
| auto arith_kernel_ptr = | |||
| std::make_unique<kernel::TransposeOpenCLKernel>(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| auto arith_kernel = arith_kernel_ptr.release(); | |||
| auto arith_kernel = kernel::OpenCLKernelCreator<kernel::TransposeOpenCLKernel>( | |||
| inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, kernel::KernelKey(), nullptr); | |||
| if (arith_kernel == nullptr) { | |||
| MS_LOG(ERROR) << "arith_kernel create error."; | |||
| return; | |||
| } | |||
| arith_kernel->Init(); | |||
| inputs[0]->MallocData(allocator); | |||