diff --git a/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc index df333f7df9..90f8068fdd 100644 --- a/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc +++ b/mindspore/lite/src/runtime/gpu/opencl/opencl_runtime.cc @@ -374,11 +374,11 @@ int OpenCLRuntime::BuildKernel(const cl::Kernel &kernel, const std::string &prog if (fp16_enable_) { build_option += " -DFP16_ENABLE=1 -DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 -DAS_UINT4=as_ushort4 -DUINT4=ushort4" - " -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT=convert_half -DTO_FLT4=convert_half4"; + " -DTO_FLT=convert_half -DTO_FLT4=convert_half4"; } else { build_option += " -DFP16_ENABLE=0 -DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 -DAS_UINT4=as_uint4 -DUINT4=uint4" - " -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT=convert_float -DTO_FLT4=convert_float4"; + " -DTO_FLT=convert_float -DTO_FLT4=convert_float4"; } build_option += " -DMAX_IMAGE2D_WIDTH=" + std::to_string(max_image2d_width_); build_option = diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl index a2b67d3fa9..5efff4a7bf 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl @@ -11,7 +11,7 @@ __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { \ return; \ } \ - FLT4 result; + TYPE4 result; // axis = 1 #define DOConcat2inputaxis1_NHWC4 \ @@ -248,12 +248,12 @@ CONCAT2(2input, axis3, _NHWC4) int Align_OutShape = output_shape.w; \ int index_output = (IN * output_shape.y + IH) * stride_w + IW * Align_OutShape * C4NUM; -int doconcat(__read_only image2d_t input, __global FLT *output, int Align_Shape, int4 input_shape, int IN, int IH, +int doconcat(__read_only image2d_t input, __global TYPE *output, int Align_Shape, int4 input_shape, int IN, int IH, int Y, int index_output) { int Remainder = input_shape.w % C4NUM; for (int i = 0; i < Align_Shape; ++i) { - FLT4 result = READ_IMAGE(input, smp_none, (int2)((Y * Align_Shape + i), (IN * input_shape.y + IH))); - FLT result_temp[4] = {result.x, result.y, result.z, result.w}; + TYPE4 result = READ_IMAGE(input, smp_none, (int2)((Y * Align_Shape + i), (IN * input_shape.y + IH))); + TYPE result_temp[4] = {result.x, result.y, result.z, result.w}; if ((i + 1) * C4NUM <= input_shape.w) { for (int j = 0; j < C4NUM; ++j) { output[index_output++] = result_temp[j]; @@ -268,7 +268,7 @@ int doconcat(__read_only image2d_t input, __global FLT *output, int Align_Shape, } __kernel void ConcatInput2UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1, - __global FLT *output, int4 input_shape0, int4 input_shape1, int stride_w, + __global TYPE *output, int4 input_shape0, int4 input_shape1, int stride_w, int4 output_shape) { CHECK_IDX_UNALIGN; index_output = doconcat(input0, output, Align_Shape0, input_shape0, IN, IH, Y, index_output); @@ -276,7 +276,7 @@ __kernel void ConcatInput2UnAlign_NHWC4(__read_only image2d_t input0, __read_onl } __kernel void ConcatInput3UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1, - __read_only image2d_t input2, __global FLT *output, int4 input_shape0, + __read_only image2d_t input2, __global TYPE *output, int4 input_shape0, int4 input_shape1, int4 input_shape2, int stride_w, int4 output_shape) { CHECK_IDX_UNALIGN; int Align_Shape2 = UP_DIV(input_shape2.w, C4NUM); @@ -287,7 +287,7 @@ __kernel void ConcatInput3UnAlign_NHWC4(__read_only image2d_t input0, __read_onl __kernel void ConcatInput4UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2, __read_only image2d_t input3, - __global FLT *output, int4 input_shape0, int4 input_shape1, int4 input_shape2, + __global TYPE *output, int4 input_shape0, int4 input_shape1, int4 input_shape2, int4 input_shape3, int stride_w, int4 output_shape) { CHECK_IDX_UNALIGN; int Align_Shape2 = UP_DIV(input_shape2.w, C4NUM), Align_Shape3 = UP_DIV(input_shape3.w, C4NUM); @@ -299,7 +299,7 @@ __kernel void ConcatInput4UnAlign_NHWC4(__read_only image2d_t input0, __read_onl __kernel void ConcatInput5UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2, __read_only image2d_t input3, - __read_only image2d_t input4, __global FLT *output, int4 input_shape0, + __read_only image2d_t input4, __global TYPE *output, int4 input_shape0, int4 input_shape1, int4 input_shape2, int4 input_shape3, int4 input_shape4, int stride_w, int4 output_shape) { CHECK_IDX_UNALIGN; @@ -315,7 +315,7 @@ __kernel void ConcatInput5UnAlign_NHWC4(__read_only image2d_t input0, __read_onl __kernel void ConcatInput6UnAlign_NHWC4(__read_only image2d_t input0, __read_only image2d_t input1, __read_only image2d_t input2, __read_only image2d_t input3, __read_only image2d_t input4, __read_only image2d_t input5, - __global FLT *output, int4 input_shape0, int4 input_shape1, int4 input_shape2, + __global TYPE *output, int4 input_shape0, int4 input_shape1, int4 input_shape2, int4 input_shape3, int4 input_shape4, int4 input_shape5, int stride_w, int4 output_shape) { CHECK_IDX_UNALIGN; diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl index 36edf3a38f..fab23758b6 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl @@ -1,48 +1,43 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define C4NUM 4 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - -#define GATHER(SUFFIX, READ_IMAGE, WRITE_IMAGE, TYPE) \ - __kernel void gather_##SUFFIX(__write_only image2d_t dst_data, __read_only image2d_t src_data, \ - __global int *indices, int4 src_size, int4 dst_size, int indices_num, int axis) { \ - int X = get_global_id(0); \ - int Y = get_global_id(1); \ - int Z = get_global_id(2); \ - if (X >= dst_size.x || Y >= dst_size.y * dst_size.w || Z >= dst_size.z || dst_size.y == 0) { \ - return; \ - } \ - TYPE##4 res_data = (TYPE##4)(0.0f, 0.0f, 0.0f, 0.0f); \ - int batch = Y / dst_size.y; \ - int height = Y % dst_size.y; \ - if (axis == 0) { \ - res_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + Z, indices[batch] * src_size.y + height)); \ - } else if (axis == 1) { \ - res_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + Z, batch * src_size.y + indices[height])); \ - } else if (axis == 2) { \ - res_data = READ_IMAGE(src_data, smp_zero, (int2)(indices[X] * src_size.z + Z, batch * src_size.y + height)); \ - } else if (axis == 3) { \ - int offset[4] = {indices[Z * 4] / 4, indices[Z * 4 + 1] / 4, indices[Z * 4 + 2] / 4, indices[Z * 4 + 3] / 4}; \ - TYPE tmp[4]; \ - TYPE res_tmp[4]; \ - for (int i = 0; i < indices_num; ++i) { \ - TYPE##4 rd_data = (TYPE##4)(0.0f, 0.0f, 0.0f, 0.0f); \ - rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + offset[i], batch * src_size.y + height)); \ - if (i >= 1 && offset[i] != offset[i - 1]) { \ - rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + offset[i], batch * src_size.y + height)); \ - } \ - tmp[0] = rd_data.x; \ - tmp[1] = rd_data.y; \ - tmp[2] = rd_data.z; \ - tmp[3] = rd_data.w; \ - res_tmp[i] = tmp[indices[Z * 4 + i] % 4]; \ - } \ - res_data.x = res_tmp[0]; \ - res_data.y = res_tmp[1]; \ - res_data.z = res_tmp[2]; \ - res_data.w = res_tmp[3]; \ - } \ - WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, batch * dst_size.y + height), res_data); \ +__kernel void gather(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices, + int4 src_size, int4 dst_size, int indices_num, int axis) { + int X = get_global_id(0); + int Y = get_global_id(1); + int Z = get_global_id(2); + if (X >= dst_size.x || Y >= dst_size.y * dst_size.w || Z >= dst_size.z || dst_size.y == 0) { + return; } -// GATHER(SUFFIX, READ_IMAGE, WRITE_IMAGE, TYPE) -GATHER(float, READ_IMAGE, WRITE_IMAGE, FLT); -GATHER(int, read_imagei, write_imagei, int); + TYPE4 res_data = (TYPE4)(0.0f, 0.0f, 0.0f, 0.0f); + int batch = Y / dst_size.y; + int height = Y % dst_size.y; + if (axis == 0) { + res_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + Z, indices[batch] * src_size.y + height)); + } else if (axis == 1) { + res_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + Z, batch * src_size.y + indices[height])); + } else if (axis == 2) { + res_data = READ_IMAGE(src_data, smp_zero, (int2)(indices[X] * src_size.z + Z, batch * src_size.y + height)); + } else if (axis == 3) { + int offset[4] = {indices[Z * 4] / 4, indices[Z * 4 + 1] / 4, indices[Z * 4 + 2] / 4, indices[Z * 4 + 3] / 4}; + TYPE tmp[4]; + TYPE res_tmp[4]; + for (int i = 0; i < indices_num; ++i) { + TYPE4 rd_data = (TYPE4)(0.0f, 0.0f, 0.0f, 0.0f); + rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + offset[i], batch * src_size.y + height)); + if (i >= 1 && offset[i] != offset[i - 1]) { + rd_data = READ_IMAGE(src_data, smp_zero, (int2)(X * src_size.z + offset[i], batch * src_size.y + height)); + } + tmp[0] = rd_data.x; + tmp[1] = rd_data.y; + tmp[2] = rd_data.z; + tmp[3] = rd_data.w; + res_tmp[i] = tmp[indices[Z * 4 + i] % 4]; + } + res_data.x = res_tmp[0]; + res_data.y = res_tmp[1]; + res_data.z = res_tmp[2]; + res_data.w = res_tmp[3]; + } + WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, batch * dst_size.y + height), res_data); +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/one_hot.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/one_hot.cl index 49ebd72ad8..2defe94368 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/one_hot.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/one_hot.cl @@ -22,7 +22,7 @@ __kernel void OneHotAxis0(__read_only image2d_t src_data, __write_only image2d_t int N = Z / out_shape.y; int H = Z % out_shape.y; int in_index = (H * out_shape.z + Y) * out_shape.w + X; - int4 indices = read_imagei(src_data, smp_zero, (int2)(in_index % in_image2d_shape.x, in_index / in_image2d_shape.x)); + int4 indices = READ_IMAGE(src_data, smp_zero, (int2)(in_index % in_image2d_shape.x, in_index / in_image2d_shape.x)); int *indices_int = (int *)&indices; for (int i = 0; i < C4NUM; i++) { if (support_neg_index != 0 && indices_int[i] < 0) { @@ -42,7 +42,7 @@ __kernel void OneHotAxis0(__read_only image2d_t src_data, __write_only image2d_t if (4 * X + 3 < C) { SET_ON_OR_OFF_VALUE(result.w, N, indices_int[3], on_value, off_value); } - write_imagef(dst_data, (int2)(Y * out_shape.w + X, Z), result); + WRITE_IMAGE(dst_data, (int2)(Y * out_shape.w + X, Z), result); } __kernel void OneHotAxis1(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 in_image2d_shape, @@ -54,7 +54,7 @@ __kernel void OneHotAxis1(__read_only image2d_t src_data, __write_only image2d_t int N = Z / out_shape.y; int H = Z % out_shape.y; int in_index = (N * out_shape.z + Y) * out_shape.w + X; - int4 indices = read_imagei(src_data, smp_zero, (int2)(in_index % in_image2d_shape.x, in_index / in_image2d_shape.x)); + int4 indices = READ_IMAGE(src_data, smp_zero, (int2)(in_index % in_image2d_shape.x, in_index / in_image2d_shape.x)); int *indices_int = (int *)&indices; for (int i = 0; i < C4NUM; i++) { if (support_neg_index != 0 && indices_int[i] < 0) { @@ -74,7 +74,7 @@ __kernel void OneHotAxis1(__read_only image2d_t src_data, __write_only image2d_t if (4 * X + 3 < C) { SET_ON_OR_OFF_VALUE(result.w, H, indices_int[3], on_value, off_value); } - write_imagef(dst_data, (int2)(Y * out_shape.w + X, Z), result); + WRITE_IMAGE(dst_data, (int2)(Y * out_shape.w + X, Z), result); } __kernel void OneHotAxis2(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 in_image2d_shape, @@ -86,7 +86,7 @@ __kernel void OneHotAxis2(__read_only image2d_t src_data, __write_only image2d_t int N = Z / out_shape.y; int H = Z % out_shape.y; int in_index = (N * out_shape.y + H) * out_shape.w + X; - int4 indices = read_imagei(src_data, smp_zero, (int2)(in_index % in_image2d_shape.x, in_index / in_image2d_shape.x)); + int4 indices = READ_IMAGE(src_data, smp_zero, (int2)(in_index % in_image2d_shape.x, in_index / in_image2d_shape.x)); int *indices_int = (int *)&indices; for (int i = 0; i < C4NUM; i++) { if (support_neg_index != 0 && indices_int[i] < 0) { @@ -106,7 +106,7 @@ __kernel void OneHotAxis2(__read_only image2d_t src_data, __write_only image2d_t if (4 * X + 3 < C) { SET_ON_OR_OFF_VALUE(result.w, Y, indices_int[3], on_value, off_value); } - write_imagef(dst_data, (int2)(Y * out_shape.w + X, Z), result); + WRITE_IMAGE(dst_data, (int2)(Y * out_shape.w + X, Z), result); } __kernel void OneHotAxis3(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 in_image2d_shape, @@ -121,7 +121,7 @@ __kernel void OneHotAxis3(__read_only image2d_t src_data, __write_only image2d_t int in_index_c4 = (N * out_shape.y + H) * ci4_size + Y / 4; int in_index_c4_remainder = Y % 4; int4 indices = - read_imagei(src_data, smp_zero, (int2)(in_index_c4 % in_image2d_shape.x, in_index_c4 / in_image2d_shape.x)); + READ_IMAGE(src_data, smp_zero, (int2)(in_index_c4 % in_image2d_shape.x, in_index_c4 / in_image2d_shape.x)); int *indices_int = (int *)&indices; int index_one = indices_int[in_index_c4_remainder]; if (support_neg_index != 0 && index_one < 0) { @@ -140,7 +140,7 @@ __kernel void OneHotAxis3(__read_only image2d_t src_data, __write_only image2d_t if (4 * X + 3 < C) { SET_ON_OR_OFF_VALUE(result.w, 4 * X + 3, index_one, on_value, off_value); } - write_imagef(dst_data, (int2)(Y * out_shape.w + X, Z), result); + WRITE_IMAGE(dst_data, (int2)(Y * out_shape.w + X, Z), result); } __kernel void OneHot2DAxis3(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 in_image2d_shape, @@ -150,7 +150,7 @@ __kernel void OneHot2DAxis3(__read_only image2d_t src_data, __write_only image2d int Z = get_global_id(2); // H * N (out_shape.h is 1, so N == Z) if (X >= out_shape.w || Y >= out_shape.z || Z >= out_shape.x * out_shape.y) return; int in_index_c4_remainder = Z % 4; - int4 indices = read_imagei(src_data, smp_zero, (int2)(Z / C4NUM, 0)); + int4 indices = READ_IMAGE(src_data, smp_zero, (int2)(Z / C4NUM, 0)); int *indices_int = (int *)&indices; int index_one = indices_int[in_index_c4_remainder]; if (support_neg_index != 0 && index_one < 0) { @@ -169,5 +169,5 @@ __kernel void OneHot2DAxis3(__read_only image2d_t src_data, __write_only image2d if (4 * X + 3 < C) { SET_ON_OR_OFF_VALUE(result.w, 4 * X + 3, index_one, on_value, off_value); } - write_imagef(dst_data, (int2)(Y * out_shape.w + X, Z), result); + WRITE_IMAGE(dst_data, (int2)(Y * out_shape.w + X, Z), result); } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl index 2acccbdf9b..7be0cfdcf1 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl @@ -3,52 +3,47 @@ #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -#define RESHAPE_NHWC4(SUFFIX, READ_IMAGE, WRITE_IMAGE, TYPE) \ - __kernel void reshape_NHWC4_##SUFFIX(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 src_size, \ - int4 dst_size) { \ - int X = get_global_id(0); \ - int Y = get_global_id(1); \ - int CO4 = UP_DIV(dst_size.z, C4NUM); \ - int CO4_rem = dst_size.z % C4NUM; \ - if (X >= dst_size.x || Y > dst_size.y) { \ - return; \ - } \ - int CI4 = UP_DIV(src_size.x, C4NUM); \ - int CI4_rem = src_size.x % C4NUM; \ - CI4_rem = (CI4_rem == 0) ? C4NUM : CI4_rem; \ - int in_img_x = CI4 * src_size.y; \ - TYPE##4 res = (TYPE##4)(0.0f); \ - TYPE tmp[4]; \ - TYPE res_tmp[4]; \ - int gcnt = 0; \ - if (CO4_rem == 0 && ((CI4_rem & 0x3) == 0)) { \ - gcnt = X + dst_size.x * Y; \ - res = READ_IMAGE(src_data, smp_zero, (int2)(gcnt % in_img_x, gcnt / in_img_x)); \ - WRITE_IMAGE(dst_data, (int2)(X, Y), res); \ - } else { \ - int start = ((X / CO4 * dst_size.z + min(dst_size.z, (X % CO4) * C4NUM)) + dst_size.w * Y); \ - gcnt = start / src_size.x * CI4 + (start % src_size.x) / C4NUM; \ - start = start % src_size.x % C4NUM; \ - for (int i = 0, n = 0, j = start; i < C4NUM; ++n, j = 0) { \ - int X_src = (gcnt + n) % in_img_x; \ - res = READ_IMAGE(src_data, smp_zero, (int2)(X_src, (gcnt + n) / in_img_x)); \ - tmp[0] = res.x; \ - tmp[1] = res.y; \ - tmp[2] = res.z; \ - tmp[3] = res.w; \ - int k = (X_src % CI4) == (CI4 - 1) ? CI4_rem : C4NUM; \ - for (; j < k && i < C4NUM; ++j, ++i) { \ - res_tmp[i] = tmp[j]; \ - } \ - } \ - res.x = res_tmp[0]; \ - res.y = res_tmp[1]; \ - res.z = res_tmp[2]; \ - res.w = res_tmp[3]; \ - WRITE_IMAGE(dst_data, (int2)(X, Y), res); \ - } \ +__kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, int4 src_size, + int4 dst_size) { + int X = get_global_id(0); + int Y = get_global_id(1); + int CO4 = UP_DIV(dst_size.z, C4NUM); + int CO4_rem = dst_size.z % C4NUM; + if (X >= dst_size.x || Y > dst_size.y) { + return; } - -// RESHAPE_NHWC4(SUFFIX, READ_IMAGE, WRITE_IMAGE, TYPE) -RESHAPE_NHWC4(float, READ_IMAGE, WRITE_IMAGE, FLT); -RESHAPE_NHWC4(int, read_imagei, write_imagei, int); + int CI4 = UP_DIV(src_size.x, C4NUM); + int CI4_rem = src_size.x % C4NUM; + CI4_rem = (CI4_rem == 0) ? C4NUM : CI4_rem; + int in_img_x = CI4 * src_size.y; + TYPE4 res = (TYPE4)(0.0f); + TYPE tmp[4]; + TYPE res_tmp[4]; + int gcnt = 0; + if (CO4_rem == 0 && ((CI4_rem & 0x3) == 0)) { + gcnt = X + dst_size.x * Y; + res = READ_IMAGE(src_data, smp_zero, (int2)(gcnt % in_img_x, gcnt / in_img_x)); + WRITE_IMAGE(dst_data, (int2)(X, Y), res); + } else { + int start = ((X / CO4 * dst_size.z + min(dst_size.z, (X % CO4) * C4NUM)) + dst_size.w * Y); + gcnt = start / src_size.x * CI4 + (start % src_size.x) / C4NUM; + start = start % src_size.x % C4NUM; + for (int i = 0, n = 0, j = start; i < C4NUM; ++n, j = 0) { + int X_src = (gcnt + n) % in_img_x; + res = READ_IMAGE(src_data, smp_zero, (int2)(X_src, (gcnt + n) / in_img_x)); + tmp[0] = res.x; + tmp[1] = res.y; + tmp[2] = res.z; + tmp[3] = res.w; + int k = (X_src % CI4) == (CI4 - 1) ? CI4_rem : C4NUM; + for (; j < k && i < C4NUM; ++j, ++i) { + res_tmp[i] = tmp[j]; + } + } + res.x = res_tmp[0]; + res.y = res_tmp[1]; + res.z = res_tmp[2]; + res.w = res_tmp[3]; + WRITE_IMAGE(dst_data, (int2)(X, Y), res); + } +} diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc index 614855db87..cebdee3c54 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc @@ -73,7 +73,15 @@ int ActivationOpenCLKernel::Prepare() { std::string program_name = "Activation"; ocl_runtime_->LoadSource(program_name, source); std::string kernel_name = GetActTypeString(type_); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeInt32) { + build_options_ext = {" -DTYPE=int -DTYPE4=int4 -DWRITE_IMAGE=write_imagei -DREAD_IMAGE=read_imagei "}; + } else if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); SetConstArgs(); SetGlobalLocal(); MS_LOG(DEBUG) << kernel_name << " init Done!"; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc index c60dfb2896..d54a027def 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc @@ -149,7 +149,15 @@ int ArgMinMaxOpenCLKernel::Prepare() { std::string source = argminmax_source; std::string program_name = "argminmax"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeInt32) { + build_options_ext = {" -DTYPE=int -DTYPE4=int4 -DWRITE_IMAGE=write_imagei -DREAD_IMAGE=read_imagei "}; + } else if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif auto *param = reinterpret_cast(this->op_parameter_); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index 6aa355607e..3bc47793a3 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -191,7 +191,15 @@ int ArithmeticOpenCLKernel::Prepare() { std::string program_name = "Arithmetic"; std::string source = arithmetic_source; ocl_runtime_->LoadSource(program_name, source); - int error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name_); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeInt32) { + build_options_ext = {" -DTYPE=int -DTYPE4=int4 -DWRITE_IMAGE=write_imagei -DREAD_IMAGE=read_imagei "}; + } else if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + int error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name_, build_options_ext); #endif if (error_code != RET_OK) { return error_code; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc index 496c11e298..fc0f649690 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc @@ -89,7 +89,15 @@ int ArithmeticSelfOpenCLKernel::Prepare() { MS_LOG(DEBUG) << "execute kernel name : " << kernel_name; std::string program_name = "ArithmeticSelf"; ocl_runtime_->LoadSource(program_name, arithmeticself_source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeInt32) { + build_options_ext = {" -DTYPE=int -DTYPE4=int4 -DWRITE_IMAGE=write_imagei -DREAD_IMAGE=read_imagei "}; + } else if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); SetGlobalLocal(); SetConstArgs(); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc index 224e61da37..c1aa40cebc 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc @@ -94,7 +94,13 @@ int BatchToSpaceNDOpenCLKernel::Prepare() { std::string source = batch_to_space_nd_source; std::string program_name = "batch_to_space_nd"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc index 9e24456e8b..0590789a96 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc @@ -168,7 +168,13 @@ int BatchNormOpenCLKernel::Prepare() { std::string source = batchnorm_source; std::string program_name = "Batch_normalization"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); MS_LOG(DEBUG) << kernel_name << " Init Done!"; int ret = Initweight(); if (ret) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc index bb76fc7203..777b282c56 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc @@ -170,7 +170,24 @@ int ConcatOpenCLKernel::ConvertWeightToTensor() { bool src_is_fp16 = in_tensor->data_type() == kNumberTypeFloat16; PackNHWCToNHWC4(in_tensor->data_c(), weight.data(), src_is_fp16, fp16_enable && in_tensor->data_type() != kNumberTypeInt32, in_shape); - size_t dtype = fp16_enable && in_tensor->data_type() != kNumberTypeInt32 ? CL_HALF_FLOAT : CL_FLOAT; + size_t dtype; + switch (in_tensor->data_type()) { + case kNumberTypeInt32: { + dtype = CL_SIGNED_INT32; + break; + } + case kNumberTypeFloat32: { + dtype = CL_FLOAT; + break; + } + case kNumberTypeFloat16: { + dtype = CL_HALF_FLOAT; + break; + } + default: + MS_LOG(ERROR) << "Unsupported data type is" << in_tensor->data_type(); + return RET_ERROR; + } ImageSize img_size{in_shape.width, in_shape.height, dtype}; auto weight_ptr_ = allocator->Malloc(img_size, weight.data()); weight_ptrs_.push_back(weight_ptr_); @@ -207,7 +224,15 @@ int ConcatOpenCLKernel::Prepare() { std::string source = concat_source; std::string program_name = "Concat"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, {}); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeInt32) { + build_options_ext = {" -DTYPE=int -DTYPE4=int4 -DWRITE_IMAGE=write_imagei -DREAD_IMAGE=read_imagei "}; + } else if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); MS_LOG(DEBUG) << kernel_name << " Init Done!"; SetConstArgs(); SetGlobalLocal(); @@ -235,4 +260,5 @@ int ConcatOpenCLKernel::Run() { REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Concat, OpenCLKernelCreator) REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Concat, OpenCLKernelCreator) +REG_KERNEL(kGPU, kNumberTypeInt32, PrimitiveType_Concat, OpenCLKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc index 871b4b0b43..a7e33afed0 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc @@ -144,7 +144,13 @@ void Conv2DOpenCLKernel::BuildKernel() { kernel_name << "_Img"; } ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str()); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name.str(), build_options_ext); } void Conv2DOpenCLKernel::SetBlockSize() { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc index 2135107445..5cd279da16 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc @@ -65,7 +65,13 @@ int Conv2dTransposeOpenCLKernel::Prepare() { std::string source = GetActDefines() + conv2d_transpose_source; std::string program_name = "conv2d_transpose"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif auto ret = InitWeights(); if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc index 19c5c32166..3d5ca89a12 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc @@ -85,7 +85,13 @@ int DepthwiseConv2dOpenCLKernel::Prepare() { std::string program_name = "DepthwiseConv2d"; std::string source = depthwise_conv2d_source; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif auto ret = InitWeights(); if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc index 227b343d29..a4f80e0605 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc @@ -105,7 +105,13 @@ int FullConnectionOpenCLKernel::Prepare() { std::string source = fullconnection_source; std::string program_name = "FullConnection"; ocl_runtime_->LoadSource(program_name, GetActDefines() + source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif auto ret = InitWeights(); if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc index 64c4accce7..0b9d2f0d9c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fusion_eltwise.cc @@ -147,7 +147,13 @@ int FusionEltwiseOpenCLKernel::Prepare() { std::string program_name = "FusionEltwise\n" + source; std::string kernel_name = "FusionEltwise"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (ocl_runtime_->GetFp16Enable()) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } else { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); InitWeights(); SetGlobalLocal(); SetConstArgs(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc index 07e898e68d..664e1a7901 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc @@ -106,11 +106,6 @@ void GatherOpenCLKernel::SetGlobalLocal() { int GatherOpenCLKernel::Prepare() { std::string kernel_name = "gather"; - if (desc_.data_type == kNumberTypeInt32) { - kernel_name += "_int"; - } else { - kernel_name += "_float"; - } if (in_tensors_.at(0)->shape().size() == 1 && axis_ == 0) { axis_ = 3; } @@ -119,7 +114,15 @@ int GatherOpenCLKernel::Prepare() { #else std::string program_name = "gather"; ocl_runtime_->LoadSource(program_name, gather_source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, {}); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeInt32) { + build_options_ext = {" -DTYPE=int -DTYPE4=int4 -DWRITE_IMAGE=write_imagei -DREAD_IMAGE=read_imagei "}; + } else if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif if (in_tensors_.at(1)->IsConst()) { intensor1_is_tensor = false; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc index 1f769350a2..4a3f991258 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/layer_norm.cc @@ -171,9 +171,15 @@ int LayerNormOpenCLKernel::Prepare() { std::string source = layer_norm_source; std::string program_name = "LayerNormalization"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); kernel_name_mean_var += "Axis" + std::to_string(normalized_axis_) + "NHWC4"; - ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var); + ocl_runtime_->BuildKernel(kernel_mean_var_, program_name, kernel_name_mean_var, build_options_ext); MS_LOG(DEBUG) << kernel_name << " Init Done!"; SetConstArgs(); SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc index 0aa61b52b0..b6576745d2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc @@ -92,7 +92,13 @@ int MatMulOpenCLKernel::Prepare() { std::string source = matmul_source; std::string program_name = "MatMul"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif SetConstArgs(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc index 23fef86cf1..8ecc91722e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc @@ -51,10 +51,17 @@ int OneHotOpenCLKernel::Prepare() { #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); #else + std::string source = one_hot_source; std::string program_name = "OneHot"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, {}); + std::vector build_options_ext; + if (ocl_runtime_->GetFp16Enable()) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=write_imagei "}; + } else { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagei "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif InitWeights(); SetConstArgs(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc index 0ab92e2b1e..fc698cc9c5 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc @@ -74,7 +74,13 @@ int PadOpenCLKernel::Prepare() { const std::string source = pad_source; const std::string program_name = "Pad"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, "Pad"); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, "Pad", build_options_ext); SetConstArgs(); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc index e38b4e0b2f..cb7f6cc907 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc @@ -82,7 +82,13 @@ int PoolingOpenCLKernel::Prepare() { std::string source = pooling2d_source; std::string program_name = "Pooling2d"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif SetConstArgs(); SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc index a17dbea353..6ded9d93ad 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.cc @@ -118,7 +118,13 @@ int PowerOpenCLKernel::Prepare() { scale_ = param->scale_; shift_ = param->shift_; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); MS_LOG(DEBUG) << kernel_name << " Init Done!"; SetGlobalLocal(); SetConstArgs(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc index 9d914db12d..533c2e5ce0 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc @@ -130,7 +130,13 @@ int PReluOpenCLKernel::Prepare() { std::string program_name = "PRelu"; std::string kernel_name = "PRelu_" + std::string(weight_is_scalar ? "scalar" : "vector"); ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); InitWeights(); MS_LOG(DEBUG) << program_name << " init Done!"; MS_LOG(DEBUG) << "kernel_name=: " << kernel_name << " init Done!"; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc index 4953a22727..7e5ee9de50 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc @@ -186,7 +186,13 @@ int ReduceOpenCLKernel::Prepare() { std::string source = reduce_source; std::string program_name = "Reduce"; ocl_runtime_->LoadSource(program_name, source); - auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + auto ret = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); if (ret != RET_OK) { return ret; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc index e7e88e6bb5..deed162f5b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc @@ -73,18 +73,21 @@ void ReshapeOpenCLKernel::SetGlobalLocal() { int ReshapeOpenCLKernel::Prepare() { std::string kernel_name = "reshape_NHWC4"; - if (desc_.data_type == kNumberTypeInt32) { - kernel_name += "_int"; - } else { - kernel_name += "_float"; - } #ifdef PROGRAM_WITH_IL kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); #else std::string source = reshape_source; std::string program_name = "reshape"; + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeInt32) { + build_options_ext = {" -DTYPE=int -DTYPE4=int4 -DWRITE_IMAGE=write_imagei -DREAD_IMAGE=read_imagei "}; + } else if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DTYPE=float -DTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DTYPE=half -DTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, {}); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc index ecb6bc3bef..551f479d66 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc @@ -70,7 +70,13 @@ int ResizeOpenCLKernel::Prepare() { std::string source = resize_source; std::string program_name = "Resize"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif SetConstArgs(); SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc index 71f0006b5f..26b725da49 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc @@ -166,7 +166,13 @@ int ScaleOpenCLKernel::Prepare() { std::string program_name = "Scale"; std::string source = GetActDefines() + scale_source; ocl_runtime_->LoadSource(program_name, source); - error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif if (error_code != RET_OK) { return error_code; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc index 0e14de1614..433b177fa6 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc @@ -84,13 +84,15 @@ int SoftmaxOpenCLKernel::Prepare() { #else std::string program_name = "Softmax"; ocl_runtime_->LoadSource(program_name, source); - std::vector ext_build_opt; - if (out_tensors_[0]->data_type() == kNumberTypeFloat32) { - ext_build_opt.push_back("-DOUT_FLT4=convert_float4 -DWRITE_IMAGEOUT=write_imagef"); - } else { - ext_build_opt.push_back("-DOUT_FLT4=convert_half4 -DWRITE_IMAGEOUT=write_imageh"); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = { + " -DOUT_FLT4=convert_float4 -DWRITE_IMAGEOUT=write_imagef -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = { + " -DOUT_FLT4=convert_half4 -DWRITE_IMAGEOUT=write_imageh -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; } - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif SetConstArgs(); SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc index a75c64c51e..7ace8d03c3 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc @@ -100,7 +100,13 @@ int SpaceToBatchNDOpenCLKernel::Prepare() { std::string source = space_to_batch_nd_source; std::string program_name = "space_to_batch_nd"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc index b301ae046f..1d6f297b87 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc @@ -58,7 +58,13 @@ int SpaceToDepthOpenCLKernel::Prepare() { std::string source = space_to_depth_source; std::string program_name = "SpaceToDepth"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif SetConstArgs(); SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc index 8273b75550..471b1ca6be 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc @@ -148,7 +148,13 @@ int SparseToDenseOpenCLKernel::Prepare() { std::string source = sparse_to_dense_source; std::string program_name = "SparseToDense"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); if (in_tensors_.size() > 3) { auto input_tensor3 = in_tensors_[3]; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc index 634bc940fb..ca5d87f8e6 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/split.cc @@ -136,7 +136,13 @@ int SplitOpenCLKernel::Prepare() { std::string source = split_source; std::string program_name = "split"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); MS_LOG(DEBUG) << kernel_name << " Init Done!"; SetConstArgs(); SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc index 973d812ab6..a64ba3786f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc @@ -165,7 +165,13 @@ int StackOpenCLKernel::Prepare() { std::string source = stack_source; std::string program_name = "stack"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); SetConstArgs(); SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc index 74bc4c40a4..aa00b9a019 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strassen.cc @@ -32,12 +32,18 @@ int StrassenOpenCLKernel::Prepare() { std::string source = strassen_source; std::string program_name = "MatMul"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); - ocl_runtime_->BuildKernel(kernel_IMG_add_sub_2, program_name, "MatMul_IMG_Add_Sub_2"); - ocl_runtime_->BuildKernel(kernel_BUF_add_sub_2, program_name, "MatMul_BUF_Add_Sub_2"); - ocl_runtime_->BuildKernel(kernel_back_result, program_name, "Strassen_Back_Result"); - ocl_runtime_->BuildKernel(MatMul_StrassenBUFFilled, program_name, "MatMul_BUF_Filled"); - ocl_runtime_->BuildKernel(MatMul_StrassenIMGFilled, program_name, "MatMul_IMG_Filled"); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); + ocl_runtime_->BuildKernel(kernel_IMG_add_sub_2, program_name, "MatMul_IMG_Add_Sub_2", build_options_ext); + ocl_runtime_->BuildKernel(kernel_BUF_add_sub_2, program_name, "MatMul_BUF_Add_Sub_2", build_options_ext); + ocl_runtime_->BuildKernel(kernel_back_result, program_name, "Strassen_Back_Result", build_options_ext); + ocl_runtime_->BuildKernel(MatMul_StrassenBUFFilled, program_name, "MatMul_BUF_Filled", build_options_ext); + ocl_runtime_->BuildKernel(MatMul_StrassenIMGFilled, program_name, "MatMul_IMG_Filled", build_options_ext); auto ret = InitWeights(); if (ret != RET_OK) { return ret; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc index e05db42848..0578f7eae3 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc @@ -89,7 +89,13 @@ int StridedSliceOpenCLKernel::CheckSpecs() { int StridedSliceOpenCLKernel::Prepare() { std::string program_name = "strided_slice"; ocl_runtime_->LoadSource(program_name, strided_slice_source); - ocl_runtime_->BuildKernel(kernel_, program_name, "strided_slice"); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, "strided_slice", build_options_ext); SetConstArgs(); SetGlobalLocal(); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index ea39694db7..940645fb8c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -84,21 +84,19 @@ int TransposeOpenCLKernel::Prepare() { perm_4d_[2] = 2; perm_4d_[3] = 3; } + std::string kernel_name = "transpose"; + if (tensor_size_.N == 1 && perm_4d_[0] == 0 && perm_4d_[1] == 3 && perm_4d_[2] == 1 && perm_4d_[3] == 2) { type_ = TransposeType::AXIS0312; + kernel_name += "_0312"; } else if (tensor_size_.N == 1 && perm_4d_[0] == 0 && perm_4d_[1] == 2 && perm_4d_[2] == 3 && perm_4d_[3] == 1) { type_ = TransposeType::AXIS0231; - } else { - type_ = TransposeType::GENERAL; - } - std::string kernel_name = "transpose"; - if (type_ == TransposeType::AXIS0312) { - kernel_name += "_0312"; - } else if (type_ == TransposeType::AXIS0231) { kernel_name += "_0231"; } else { + type_ = TransposeType::GENERAL; kernel_name += "_general"; } + if (in_tensors_[0]->shape().size() == 4 && in_tensors_[0]->shape()[2] * UP_DIV(in_tensors_[0]->shape()[3], C4NUM) > ocl_runtime_->GetMaxImage2DWidth()) { // just for input @@ -112,7 +110,13 @@ int TransposeOpenCLKernel::Prepare() { std::string source = transpose_source; std::string program_name = "transpose"; ocl_runtime_->LoadSource(program_name, source); - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options_ext); #endif SetConstArgs(); SetGlobalLocal(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc index 92729bc775..138aefe9cf 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/winograd.cc @@ -82,10 +82,16 @@ std::vector GenerateWinogradFilter(void *src, TypeId dtype, size_t CO, si void WinogradOpenCLKernel::BuildKernel() { std::string program_name = "winograd"; ocl_runtime_->LoadSource(program_name, GetActDefines() + winograd_source); - ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36"); + std::vector build_options_ext; + if (desc_.data_type == kNumberTypeFloat32) { + build_options_ext = {" -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "}; + } else if (desc_.data_type == kNumberTypeFloat16) { + build_options_ext = {" -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "}; + } + ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options_ext); ocl_runtime_->BuildKernel(kernel_, program_name, - filter_type_ == MemType::IMG ? "WinogradConv2D_Img" : "WinogradConv2D"); - ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); + filter_type_ == MemType::IMG ? "WinogradConv2D_Img" : "WinogradConv2D", build_options_ext); + ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options_ext); } void WinogradOpenCLKernel::InitFilter() { diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc index 7f6fa1f87a..31c3842456 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc @@ -234,19 +234,24 @@ int GetBroadcastGpuAxis(int ndim, int ori_axis) { return axis; } -void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor) { +void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor, + int data_type) { MS_ASSERT(src); MS_ASSERT(dst); auto src_fp16 = reinterpret_cast(src); auto src_fp32 = reinterpret_cast(src); + auto src_int32 = reinterpret_cast(src); auto dst_fp16 = reinterpret_cast(dst); auto dst_fp32 = reinterpret_cast(dst); + auto dst_int32 = reinterpret_cast(dst); for (int n = 0, src_idx = 0; n < tensor.N; n++) { for (int h = 0; h < tensor.H; ++h) { for (int w = 0; w < tensor.W; ++w) { for (int c = 0; c < tensor.C; ++c, ++src_idx) { int dst_idx = ((n * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c; - if (dst_is_fp16) { + if (data_type == kNumberTypeInt32) { + dst_int32[dst_idx] = src_int32[src_idx]; + } else if (dst_is_fp16) { dst_fp16[dst_idx] = src_is_fp16 ? src_fp16[src_idx] : static_cast(src_fp32[src_idx]); } else { dst_fp32[dst_idx] = src_is_fp16 ? static_cast(src_fp16[src_idx]) : src_fp32[src_idx]; diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h index e7bbd275f2..254a61a7de 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.h +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h @@ -58,7 +58,8 @@ int WriteToBin(const std::string &file_path, void *data, size_t size); int GetBroadcastGpuAxis(int ndim, int ori_axis); -void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor); +void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor, + int data_type = kNumberTypeFloat32); int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor, TypeId expect_data_type, const std::vector &expect_shape); diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc index 3f1d736e17..7e05d6a095 100644 --- a/mindspore/lite/src/scheduler.cc +++ b/mindspore/lite/src/scheduler.cc @@ -394,7 +394,7 @@ kernel::LiteKernel *Scheduler::FindGpuKernel(const std::vector &in_ten if (context_->IsGpuEnabled()) { // support more data type like int32 kernel::KernelKey gpu_desc{kGPU, desc.data_type, desc.type}; - if (context_->IsGpuFloat16Enabled()) { + if (desc.data_type == kNumberTypeFloat32 && context_->IsGpuFloat16Enabled()) { gpu_desc.data_type = kNumberTypeFloat16; } diff --git a/mindspore/lite/test/models_gpu_fp16.cfg b/mindspore/lite/test/models_gpu_fp16.cfg index dfa4cc9234..ab14a99f2b 100644 --- a/mindspore/lite/test/models_gpu_fp16.cfg +++ b/mindspore/lite/test/models_gpu_fp16.cfg @@ -9,3 +9,4 @@ mtk_model_emotions_0727_nosoftmax.tflite landmark PoseNet_dla_17_x512_tmp plat_isface +ml_location_lane_counter.onnx;5.5 diff --git a/mindspore/lite/test/models_gpu_fp32.cfg b/mindspore/lite/test/models_gpu_fp32.cfg index 2f055edbbd..2382d6091c 100644 --- a/mindspore/lite/test/models_gpu_fp32.cfg +++ b/mindspore/lite/test/models_gpu_fp32.cfg @@ -57,3 +57,45 @@ mtk_model_face_dress.pb;0.5;1;1,128,128,3 hiai_model_normalize_object_scene_ps_20200519.pb;0.5;1;1,224,224,3 hiai_label_and_video.pb;0.5;1;1,224,224,3 tinyyolov2-8.onnx;0.5;1;1,416,416,3 +mtk_detect-mbv2-shortcut-400-400-simplified.onnx +emotion-ferplus-8.onnx +rcnn-ilsvrc13-9.onnx +shufflenet-v2-10.onnx +squeezenet1.1-7.onnx +ml_table_detection_fp32_tmp.onnx +ml_table_segment.onnx +googlenet-9.onnx +inception-v1-9.onnx +shufflenet-9.onnx +ml_face_3d.onnx +gts_version-RFB-320_simplified.onnx +mnist-8.onnx +ml_video_edit_judge.onnx +ml_video_edit_vignet.onnx +hdc_mobilenet_1w_class.onnx +ml_video_edit_imitate_filter.onnx +ml_edu_kit_hand_detection.onnx +ml_edu_kit_hand_key_position.onnx +mtk_detect-deeper-halfdeeper-mbv1-shortcut-400-400_nopostprocess_simplified_onnx.onnx +mtk_detect-mbv1-shortcut-400-400_nopostprocess_simplified_onnx.onnx +mtk_detect-deeper-halfdeeper-mbv1-lastearlySSD-shortcut-400-400_nopostprocess_simplified_onnx.onnx +ml_2012_ocr_detection_tmp.onnx +ml_video_edit_enhance_update_tmp.onnx +bloom_hongmo_detection_tmp.onnx +Q_face_recognition.onnx +Q888_iris_detect.onnx +ml_ocr_bank_card_detection_inception_tmp +ml_ocr_detect_20200305 +Q_iMaxDN_RGB_385_p_RGB_RGB_pb2tflite.tflite +Q_iMaxSR_RGB_385_p_pb2tflite.tflite +mtk_age_gender.pb +mtk_model_ckpt.pb +Q_inception-249970-672-11-16.pb +Q_crnn_screen_slim400w_more_20w.pb +hiai_ssd_mobilenetv2_object.pb +hiai_humanDetection.pb +mtk_face_features_v1.pb +Q_crnn_ori_75w_slim_norm.pb +Q_crnn_ori_v2_405001_notrans_nopre.pb +bolt_segment.pb +ml_location_lane_counter.onnx;2 diff --git a/mindspore/lite/test/run_benchmark_nets.sh b/mindspore/lite/test/run_benchmark_nets.sh index bdcd2a3562..b61cbe4407 100644 --- a/mindspore/lite/test/run_benchmark_nets.sh +++ b/mindspore/lite/test/run_benchmark_nets.sh @@ -2123,15 +2123,17 @@ function Run_gpu() { input_files=$input_files${data_path}'input/'$model_name'.ms.bin_'$i',' done fi + if [[ ${accuracy_limit} == "" ]]; then + accuracy_limit="0.5" + fi echo ${model_name} >> "${run_gpu_log_file}" echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt if [[ $input_shapes == "" ]]; then - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}" - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}" + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt else echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}" echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --device=GPU --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt - fi adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_gpu_log_file}" @@ -2148,17 +2150,39 @@ function Run_gpu() { if [[ $model_name == \#* ]]; then continue fi + model_name=`echo ${line} | awk -F ';' '{print $1}'` + accuracy_limit=`echo ${line} | awk -F ';' '{print $2}'` + input_num=`echo ${line} | awk -F ';' '{print $3}'` + input_shapes=`echo ${line} | awk -F ';' '{print $4}'` + input_files="" + data_path="/data/local/tmp/input_output/" + output_file=${data_path}'output/'${model_name}'.ms.out' + if [[ ${input_num} == "" || ${input_num} == 1 ]]; then + input_files=/data/local/tmp/input_output/input/${model_name}.ms.bin + else + for i in $(seq 1 $input_num) + do + input_files=$input_files${data_path}'input/'$model_name'.ms.bin_'$i',' + done + fi + if [[ ${accuracy_limit} == "" ]]; then + accuracy_limit="5" + fi echo ${model_name} >> "${run_gpu_log_file}" echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/data/local/tmp/input_output/output/'${model_name}'.ms.out --enableFp16=true --accuracyThreshold=5' >> "${run_gpu_log_file}" - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'.ms --inDataFile=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/data/local/tmp/input_output/output/'${model_name}'.ms.out --enableFp16=true --accuracyThreshold=5' >> adb_run_cmd.txt + if [[ $input_shapes == "" ]]; then + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}" + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt + else + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}" + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt + fi adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_gpu_log_file}" if [ $? = 0 ]; then run_result='arm64_gpu_fp16: '${model_name}' pass'; echo ${run_result} >> ${run_benchmark_result_file} else run_result='arm64_gpu_fp16: '${model_name}' failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1 fi - #sleep 1 done < ${models_gpu_fp16_config} # Run GPU weightquant converted models: @@ -2167,17 +2191,39 @@ function Run_gpu() { if [[ $model_name == \#* ]]; then continue fi + model_name=`echo ${line} | awk -F ';' '{print $1}'` + accuracy_limit=`echo ${line} | awk -F ';' '{print $2}'` + input_num=`echo ${line} | awk -F ';' '{print $3}'` + input_shapes=`echo ${line} | awk -F ';' '{print $4}'` + input_files="" + data_path="/data/local/tmp/input_output/" + output_file=${data_path}'output/'${model_name}'.ms.out' + if [[ ${input_num} == "" || ${input_num} == 1 ]]; then + input_files=/data/local/tmp/input_output/input/${model_name}.ms.bin + else + for i in $(seq 1 $input_num) + do + input_files=$input_files${data_path}'input/'$model_name'.ms.bin_'$i',' + done + fi + if [[ ${accuracy_limit} == "" ]]; then + accuracy_limit="5" + fi echo ${model_name} >> "${run_gpu_log_file}" echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'_weightquant.ms --inDataFile=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/data/local/tmp/input_output/output/'${model_name}'.ms.out --enableFp16=true --accuracyThreshold=5' >> "${run_gpu_log_file}" - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --modelFile='${model_name}'_weightquant.ms --inDataFile=/data/local/tmp/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/data/local/tmp/input_output/output/'${model_name}'.ms.out --enableFp16=true --accuracyThreshold=5' >> adb_run_cmd.txt + if [[ $input_shapes == "" ]]; then + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}" + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt + else + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> "${run_gpu_log_file}" + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test;./benchmark --device=GPU --enableFp16=true --inputShapes='${input_shapes}' --accuracyThreshold='${accuracy_limit}' --modelFile='${model_name}'.ms --inDataFile='${input_files}' --benchmarkDataFile='${output_file} >> adb_run_cmd.txt + fi adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_gpu_log_file}" if [ $? = 0 ]; then run_result='arm64_gpu_weightquant: '${model_name}' pass'; echo ${run_result} >> ${run_benchmark_result_file} else run_result='arm64_gpu_weightquant: '${model_name}' failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1 fi - #sleep 1 done < ${models_gpu_weightquant_config} }