diff --git a/mindspore/lite/src/runtime/gpu/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/gpu/opencl/opencl_allocator.cc index fe4dd8e1c3..52fa90196b 100644 --- a/mindspore/lite/src/runtime/gpu/opencl/opencl_allocator.cc +++ b/mindspore/lite/src/runtime/gpu/opencl/opencl_allocator.cc @@ -68,12 +68,14 @@ void *OpenCLAllocator::MinimumFit(MemType mem_type, size_t size, const ImageSize void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer) { cl_int ret = CL_SUCCESS; MS_ASSERT(buffer); + MS_ASSERT(size > 0); *buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), static_cast(flags), size, data, &ret); if (*buffer == nullptr) { MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")"; return nullptr; } void *host_ptr = ocl_runtime_->MapBuffer(**buffer, CL_MAP_READ | CL_MAP_WRITE, size); + MS_ASSERT(host_ptr); if (host_ptr == nullptr) { delete *buffer; MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << *buffer << ", host_ptr=" << host_ptr; diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl index 976cfc4201..39e4e61e43 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl @@ -1,450 +1,80 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void to_format_NHWC_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - FLT4 data = (FLT4)(0.f); - int offset = (X * shape.z + Y) * shape.w + Z * 4; - __global float *src_addr = (__global float *)src_data; - src_addr += offset; - if ((Z + 1) * 4 <= shape.w) { - data = TO_FLT4(((__global float4 *)src_addr)[0]); - } else { - if ((shape.w - Z * 4) >= 1) { - data.x = (FLT)src_addr[0]; - } - if ((shape.w - Z * 4) >= 2) { - data.y = (FLT)src_addr[1]; - } - if ((shape.w - Z * 4) >= 3) { - data.z = (FLT)src_addr[2]; - } - } - if (size.y * size.z <= MAX_IMAGE2D_WIDTH) - WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); - else - WRITE_IMAGE(dst_data, (int2)(Z, X * size.y + Y), data); -} -__kernel void to_format_NHWC_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - FLT4 data = (FLT4)(0.f); - int offset = (X * shape.z + Y) * shape.w + Z * 4; - __global half *src_addr = (__global half *)src_data; - src_addr += offset; - if ((Z + 1) * 4 <= shape.w) { - data = TO_FLT4(((__global half4 *)src_addr)[0]); - } else { - if ((shape.w - Z * 4) >= 1) { - data.x = (FLT)src_addr[0]; - } - if ((shape.w - Z * 4) >= 2) { - data.y = (FLT)src_addr[1]; - } - if ((shape.w - Z * 4) >= 3) { - data.z = (FLT)src_addr[2]; - } - } - if (size.y * size.z <= MAX_IMAGE2D_WIDTH) - WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); - else - WRITE_IMAGE(dst_data, (int2)(Z, X * size.y + Y), data); -} -__kernel void to_format_NCHW_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - FLT4 data = (FLT4)(0.f); - __global float *src_addr = (__global float *)src_data; - __global float *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y; - __global float *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y; - __global float *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y; - if ((Z + 1) * 4 <= shape.w) { - data = TO_FLT4(((__global float4 *)src_addr_0)[0]); - } else { - if ((shape.w - Z * 4) >= 1) { - data.x = src_addr_0[0]; - } - if ((shape.w - Z * 4) >= 2) { - data.y = src_addr_1[0]; - } - if ((shape.w - Z * 4) >= 3) { - data.z = src_addr_2[0]; - } - } - WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); -} -__kernel void to_format_NCHW_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - FLT4 data = (FLT4)(0.f); - __global half *src_addr = (__global half *)src_data; - __global half *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y; - __global half *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y; - __global half *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y; - if ((Z + 1) * 4 <= shape.w) { - data = TO_FLT4(((__global half4 *)src_addr_0)[0]); - } else { - if ((shape.w - Z * 4) >= 1) { - data.x = src_addr_0[0]; - } - if ((shape.w - Z * 4) >= 2) { - data.y = src_addr_1[0]; - } - if ((shape.w - Z * 4) >= 3) { - data.z = src_addr_2[0]; - } - } - WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data); -} -__kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z || shape.y == 0) { - return; - } - int offset = (X / shape.y) * shape.y * shape.z * shape.w + ((X % shape.y) * shape.z + Y) * shape.w + Z * 4; - __global float *src_addr = (__global float *)src_data; - src_addr += offset; - FLT4 data = (FLT4)(0.f); - if ((Z + 1) * 4 <= shape.w) { - data = TO_FLT4(((__global float4 *)src_addr)[0]); - } else { - if ((shape.w - Z * 4) >= 1) { - data.x = (FLT)src_addr[0]; - } - if ((shape.w - Z * 4) >= 2) { - data.y = (FLT)src_addr[1]; - } - if ((shape.w - Z * 4) >= 3) { - data.z = (FLT)src_addr[2]; - } - } - int pos_ix = (X / shape.y) * size.z * shape.y + Z * shape.y + X % shape.y; - WRITE_IMAGE(dst_data, (int2)(Y, pos_ix), data); -} -__kernel void to_format_NHWC_to_NC4HW4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z || shape.y == 0) { - return; - } - int offset = (X / shape.y) * shape.y * shape.z * shape.w + ((X % shape.y) * shape.z + Y) * shape.w + Z * 4; - __global half *src_addr = (__global half *)src_data; - src_addr += offset; - FLT4 data = (FLT4)(0.f); - if ((Z + 1) * 4 <= shape.w) { - data = TO_FLT4(((__global half4 *)src_addr)[0]); - } else { - if ((shape.w - Z * 4) >= 1) { - data.x = (FLT)src_addr[0]; - } - if ((shape.w - Z * 4) >= 2) { - data.y = (FLT)src_addr[1]; - } - if ((shape.w - Z * 4) >= 3) { - data.z = (FLT)src_addr[2]; - } - } - int pos_ix = (X / shape.y) * size.z * shape.y + Z * shape.y + X % shape.y; - WRITE_IMAGE(dst_data, (int2)(Y, pos_ix), data); -} -__kernel void to_format_NHWC4_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), TO_FLT4(src_data[(X * size.y + Y) * size.z + Z])); -} -__kernel void to_format_NHWC4_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), TO_FLT4(src_data[(X * size.y + Y) * size.z + Z])); -} -__kernel void to_format_NC4HW4_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, - int4 size, int4 shape) { - // size(h, w, c4, 1), shape(n, c, h, w) - int X = get_global_id(0); // h - int Y = get_global_id(1); // w - int Z = get_global_id(2); // c4 - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), TO_FLT4(src_data[(Z * size.x + X) * size.y + Y])); -} -__kernel void to_format_NC4HW4_to_NC4HW4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size, - int4 shape) { - // size(h, w, c4, 1), shape(n, c, h, w) - int X = get_global_id(0); // h - int Y = get_global_id(1); // w - int Z = get_global_id(2); // c4 - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), TO_FLT4(src_data[(Z * size.x + X) * size.y + Y])); -} -__kernel void to_format_NCHW_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - dst_data[(Z * size.y + Y) * size.x + X] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z))); -} -__kernel void to_format_NCHW_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - dst_data[(Z * size.y + Y) * size.x + X] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z))); -} -__kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - float4 data; - if (size.y * size.z <= MAX_IMAGE2D_WIDTH) - data = convert_float4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); - else - data = convert_float4(READ_IMAGEIN(src_data, smp_zero, (int2)(Z, X * size.y + Y))); - int offset = (X * shape.z + Y) * shape.w + Z * 4; - __global float *dst_addr = (__global float *)dst_data; - dst_addr += offset; - if ((Z + 1) * 4 <= shape.w) { - ((__global float4 *)dst_addr)[0] = data; - } else { - if (shape.w - Z * 4 >= 1) { - dst_addr[0] = data.x; - } - if (shape.w - Z * 4 >= 2) { - dst_addr[1] = data.y; - } - if (shape.w - Z * 4 >= 3) { - dst_addr[2] = data.z; - } - } -} -__kernel void to_format_NHWC4_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); - int offset = (X * shape.z + Y) * shape.w + Z * 4; - __global float *dst_addr = (__global float *)dst_data; - __global float *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y; - __global float *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y; - __global float *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y; - dst_addr += offset; - if ((Z + 1) * 4 <= shape.w) { - ((__global float4 *)dst_addr_0)[0] = data; - } else { - if (shape.w - Z * 4 >= 1) { - dst_addr_0[0] = data.x; - } - if (shape.w - Z * 4 >= 2) { - dst_addr_1[0] = data.y; - } - if (shape.w - Z * 4 >= 3) { - dst_addr_2[0] = data.z; - } - } -} -__kernel void to_format_NHWC4_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); - int offset = (X * shape.z + Y) * shape.w + Z * 4; - __global half *dst_addr = (__global half *)dst_data; - __global half *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y; - __global half *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y; - __global half *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y; - dst_addr += offset; - if ((Z + 1) * 4 <= shape.w) { - ((__global half4 *)dst_addr_0)[0] = data; - } else { - if (shape.w - Z * 4 >= 1) { - dst_addr_0[0] = data.x; - } - if (shape.w - Z * 4 >= 2) { - dst_addr_1[0] = data.y; - } - if (shape.w - Z * 4 >= 3) { - dst_addr_2[0] = data.z; - } - } -} -__kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - half4 data; - if (size.y * size.z <= MAX_IMAGE2D_WIDTH) - data = convert_half4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); - else - data = convert_half4(READ_IMAGEIN(src_data, smp_zero, (int2)(Z, X * size.y + Y))); - int offset = (X * shape.z + Y) * shape.w + Z * 4; - __global half *dst_addr = (__global half *)dst_data; - dst_addr += offset; - if ((Z + 1) * 4 <= shape.w) { - ((__global half4 *)dst_addr)[0] = data; - } else { - if (shape.w - Z * 4 >= 1) { - dst_addr[0] = data.x; - } - if (shape.w - Z * 4 >= 2) { - dst_addr[1] = data.y; - } - if (shape.w - Z * 4 >= 3) { - dst_addr[2] = data.z; - } - } -} -__kernel void to_format_NC4HW4_to_NHWC_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z || shape.y == 0) { - return; - } - int pos_ix = (X / shape.y) * size.z * shape.y + Z * shape.y + X % shape.y; - float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y, pos_ix))); - int offset = (X / shape.y) * shape.y * shape.z * shape.w + ((X % shape.y) * shape.z + Y) * shape.w + Z * 4; - __global float *dst_addr = (__global float *)dst_data; - dst_addr += offset; - if ((Z + 1) * 4 <= shape.w) { - ((__global float4 *)dst_addr)[0] = data; - } else { - if (shape.w - Z * 4 >= 1) { - dst_addr[0] = data.x; - } - if (shape.w - Z * 4 >= 2) { - dst_addr[1] = data.y; - } - if (shape.w - Z * 4 >= 3) { - dst_addr[2] = data.z; - } - } -} -__kernel void to_format_NC4HW4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z || shape.y == 0) { - return; - } - int pos_ix = (X / shape.y) * size.z * shape.y + Z * shape.y + X % shape.y; - half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y, pos_ix))); - int offset = (X / shape.y) * shape.y * shape.z * shape.w + ((X % shape.y) * shape.z + Y) * shape.w + Z * 4; - __global half *dst_addr = (__global half *)dst_data; - dst_addr += offset; - if ((Z + 1) * 4 <= shape.w) { - ((__global half4 *)dst_addr)[0] = data; - } else { - if (shape.w - Z * 4 >= 1) { - dst_addr[0] = data.x; - } - if (shape.w - Z * 4 >= 2) { - dst_addr[1] = data.y; - } - if (shape.w - Z * 4 >= 3) { - dst_addr[2] = data.z; - } - } -} -__kernel void to_format_NC4HW4_to_NC4HW4_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, - int4 shape) { - // size(h, w, c, 1), shape(n, c, h, w) - int X = get_global_id(0); // h - int Y = get_global_id(1); // w - int Z = get_global_id(2); // c - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - dst_data[(Z * size.x + X) * size.y + Y] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X))); -} -__kernel void to_format_NC4HW4_to_NC4HW4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, - int4 shape) { - // size(h, w, c, 1), shape(n, c, h, w) - int X = get_global_id(0); // h - int Y = get_global_id(1); // w - int Z = get_global_id(2); // c - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - dst_data[(Z * size.x + X) * size.y + Y] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X))); -} -__kernel void to_format_NHWC4_to_NHWC4_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - dst_data[(X * size.y + Y) * size.z + Z] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} -__kernel void to_format_NHWC4_to_NHWC4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size, - int4 shape) { - int X = get_global_id(0); - int Y = get_global_id(1); - int Z = get_global_id(2); - if (X >= size.x || Y >= size.y || Z >= size.z) { - return; - } - dst_data[(X * size.y + Y) * size.z + Z] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X))); -} + +#define BUF_to_IMG(src_dtype, dst_dtype, SRC_FLT, DST_FLT, WRITE_IMAGE_OUT) \ + __kernel void BUF_to_IMG_##src_dtype##_##dst_dtype(__global float4 *src_data, __write_only image2d_t dst_data, \ + int4 size, int4 shape) { \ + int X = get_global_id(0); \ + int Y = get_global_id(1); \ + int Z = get_global_id(2); \ + if (X >= size.x || Y >= size.y || Z >= size.z) { \ + return; \ + } \ + DST_FLT##4 data = (DST_FLT##4)(0.f); \ + int offset = (X * shape.z + Y) * shape.w + Z * 4; \ + __global SRC_FLT *src_addr = (__global SRC_FLT *)src_data; \ + src_addr += offset; \ + if ((Z + 1) * 4 <= shape.w) { \ + data = convert_##DST_FLT##4(((__global SRC_FLT##4 *)src_addr)[0]); \ + } else { \ + if ((shape.w - Z * 4) >= 1) { \ + data.x = (DST_FLT)src_addr[0]; \ + } \ + if ((shape.w - Z * 4) >= 2) { \ + data.y = (DST_FLT)src_addr[1]; \ + } \ + if ((shape.w - Z * 4) >= 3) { \ + data.z = (DST_FLT)src_addr[2]; \ + } \ + } \ + if (size.y * size.z <= MAX_IMAGE2D_WIDTH) \ + WRITE_IMAGE_OUT(dst_data, (int2)(Y * size.z + Z, X), data); \ + else \ + WRITE_IMAGE_OUT(dst_data, (int2)(Z, X * size.y + Y), data); \ + } + +// BUF_to_IMG(src_dtype, dst_dtype, SRC_FLT, DST_FLT, WRITE_IMAGE_OUT) +BUF_to_IMG(float32, float32, float, float, write_imagef); +BUF_to_IMG(float32, float16, float, half, write_imageh); +BUF_to_IMG(float16, float16, half, half, write_imageh); +BUF_to_IMG(int32, int32, float, float, write_imagef); +BUF_to_IMG(uint32, uint32, float, float, write_imagef); + +#define IMG_to_BUF(src_dtype, dst_dtype, SRC_FLT, DST_FLT, READ_IMAGE_IN) \ + __kernel void IMG_to_BUF_##src_dtype##_##dst_dtype(__read_only image2d_t src_data, __global DST_FLT##4 * dst_data, \ + int4 size, int4 shape) { \ + int X = get_global_id(0); \ + int Y = get_global_id(1); \ + int Z = get_global_id(2); \ + if (X >= size.x || Y >= size.y || Z >= size.z) { \ + return; \ + } \ + DST_FLT##4 data; \ + if (size.y * size.z <= MAX_IMAGE2D_WIDTH) \ + data = convert_##DST_FLT##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); \ + else \ + data = convert_##DST_FLT##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Z, X * size.y + Y))); \ + int offset = (X * shape.z + Y) * shape.w + Z * 4; \ + __global DST_FLT *dst_addr = (__global DST_FLT *)dst_data; \ + dst_addr += offset; \ + if ((Z + 1) * 4 <= shape.w) { \ + ((__global DST_FLT##4 *)dst_addr)[0] = data; \ + } else { \ + if (shape.w - Z * 4 >= 1) { \ + dst_addr[0] = data.x; \ + } \ + if (shape.w - Z * 4 >= 2) { \ + dst_addr[1] = data.y; \ + } \ + if (shape.w - Z * 4 >= 3) { \ + dst_addr[2] = data.z; \ + } \ + } \ + } + +// IMG_to_BUF(src_dtype, dst_dtype, SRC_FLT, DST_FLT, READ_IMAGE_IN) +IMG_to_BUF(float32, float32, float, float, read_imagef); +IMG_to_BUF(float16, float32, half, float, read_imageh); +IMG_to_BUF(float16, float16, half, half, read_imageh); +IMG_to_BUF(int32, int32, float, float, read_imagef); +IMG_to_BUF(uint32, uint32, float, float, read_imagef); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index 3e22ce36d1..6aa355607e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -44,6 +44,19 @@ using mindspore::schema::PrimitiveType_Eltwise; namespace mindspore::kernel { int ArithmeticOpenCLKernel::CheckSpecs() { + for (auto &tensor : in_tensors_) { + if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) { + MS_LOG(ERROR) << "ArithmeticOpenCLKernel only support fp32/fp16 input"; + return RET_ERROR; + } + } + for (auto &tensor : out_tensors_) { + if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) { + MS_LOG(ERROR) << "ArithmeticOpenCLKernel only support fp32/fp16 output"; + return RET_ERROR; + } + } + if (in_tensors_.size() != 2 || out_tensors_.size() != 1) { MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc index aa745b6613..d149a0ad11 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc @@ -286,6 +286,7 @@ void Conv2DOpenCLKernel::InitFilter() { } FreeDequantedWeight(); + FreeTmpWeight(in_tensors_.at(kWeightIndex)->data_c()); } void Conv2DOpenCLKernel::InitBias() { @@ -322,6 +323,7 @@ void Conv2DOpenCLKernel::InitBias() { } } allocator->UnmapBuffer(packed_bias_); + FreeTmpWeight(in_tensors_.at(kBiasIndex)->data_c()); } void Conv2DOpenCLKernel::SetConstArgs() { @@ -480,11 +482,9 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector &input MS_ASSERT(!inputs.empty()); MS_ASSERT(!outputs.empty()); MS_ASSERT(opParameter); - MS_ASSERT(inputs.front()->shape().size() == 4); - MS_ASSERT(outputs.front()->shape().size() == 4); auto *conv_param = reinterpret_cast(opParameter); - int input_channel = inputs.front()->shape().at(3); - int output_channel = outputs.front()->shape().at(3); + int input_channel = conv_param->input_channel_; + int output_channel = conv_param->output_channel_; int group = conv_param->group_; // case 1: depthwise conv2d @@ -529,6 +529,10 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector &input } } if (!infer_shape_done) { + StoreTmpWeight(inputs.at(kWeightIndex)); + if (inputs.size() > kBiasIndex) { + StoreTmpWeight(inputs.at(kBiasIndex)); + } MS_LOG(WARNING) << "kernel don't infer shape yet!"; return kernel; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc index 7e2911dbf9..0f8eb1b94d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc @@ -97,10 +97,11 @@ int ReshapeOpenCLKernel::Run() { } int ReshapeOpenCLKernel::PreProcess() { - if (Type() == PrimitiveType_Reshape && !infer_shape_flag_) { + if (Type() == PrimitiveType_Reshape && !op_parameter_->infer_flag_) { auto shape_tensor = in_tensors_[1]; if (!shape_tensor->IsConst()) { ocl_runtime_->SyncCommandQueue(); + shape_tensor->MutableData(); } } return OpenCLKernel::PreProcess(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc index a1ec824b35..4e0e89b840 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc @@ -33,7 +33,7 @@ using mindspore::schema::PrimitiveType_Resize; namespace mindspore::kernel { int ResizeOpenCLKernel::CheckSpecs() { - if (in_tensors_.size() != 1 || out_tensors_.size() != 1) { + if (in_tensors_.size() != 2 || out_tensors_.size() != 1) { MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); return RET_ERROR; } @@ -119,6 +119,17 @@ int ResizeOpenCLKernel::Run() { return RET_OK; } +int ResizeOpenCLKernel::PreProcess() { + if (Type() == PrimitiveType_Resize && !op_parameter_->infer_flag_) { + auto shape_tensor = in_tensors_[1]; + if (!shape_tensor->IsConst()) { + ocl_runtime_->SyncCommandQueue(); + shape_tensor->MutableData(); + } + } + return OpenCLKernel::PreProcess(); +} + REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Resize, OpenCLKernelCreator) REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Resize, OpenCLKernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h index 529cfc285b..dfda6eef48 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h @@ -34,6 +34,7 @@ class ResizeOpenCLKernel : public OpenCLKernel { int CheckSpecs() override; void SetConstArgs() override; void SetGlobalLocal() override; + int PreProcess() override; private: float getResizeScaleFactor(int input_size, int output_size); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc index d48551d831..99e5472ef2 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.cc @@ -71,8 +71,20 @@ int StackOpenCLKernel::CheckSpecs() { MS_LOG(ERROR) << " only support input size = 2 and output size = 1"; return RET_ERROR; } + for (auto &tensor : in_tensors_) { + if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) { + MS_LOG(ERROR) << " only support fp32/fp16 input"; + return RET_ERROR; + } + } + for (auto &tensor : out_tensors_) { + if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) { + MS_LOG(ERROR) << " only support fp32/fp16 output"; + return RET_ERROR; + } + } if (in_tensors_[0]->shape().size() > 4 || in_tensors_[0]->shape().size() <= 0) { - MS_LOG(ERROR) << " only support dim <= 4 "; + MS_LOG(ERROR) << " only support 0shape().size() : axis_; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc index 2935580a05..b5aacf7842 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc @@ -64,32 +64,22 @@ void ToFormatOpenCLKernel::SetGlobalLocal() { } int ToFormatOpenCLKernel::Prepare() { - std::map dtype_str{ - {kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}, {kNumberTypeInt32, "float"}}; - std::string kernel_name; - if (out_mem_type_ == MemType::IMG) { - kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_.front()->data_type()]; - } else { - kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_.front()->data_type()]; - } + static std::map dtype_str{{kNumberTypeFloat32, "float32"}, + {kNumberTypeFloat16, "float16"}, + {kNumberTypeInt32, "int32"}, + {kNumberTypeUInt32, "uint32"}}; + auto in_tensor = in_tensors_.front(); + auto out_tensor = out_tensors_.front(); + std::string kernel_name = out_mem_type_ == MemType::IMG ? "BUF_to_IMG_" : "IMG_to_BUF_"; + kernel_name += dtype_str[in_tensor->data_type()] + "_" + dtype_str[out_tensor->data_type()]; this->set_name(kernel_name); -#ifdef PROGRAM_WITH_IL - kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); -#else std::string program_name = "to_format"; std::string source = to_format_source; ocl_runtime_->LoadSource(program_name, source); - std::vector ext_build_opt; - if (in_tensors_[0]->data_type() == kNumberTypeFloat32) { - ext_build_opt.push_back("-DREAD_IMAGEIN=read_imagef"); - } else { - ext_build_opt.push_back("-DREAD_IMAGEIN=read_imageh"); - } - ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt); -#endif + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); - auto output = GpuTensorInfo(out_tensors_.front()); + auto output = GpuTensorInfo(out_tensor); N_ = output.N; H_ = output.H; W_ = output.W; @@ -112,15 +102,8 @@ int ToFormatOpenCLKernel::Run() { } int ToFormatOpenCLKernel::InferShape() { - if (infer_shape_flag_) { - return RET_OK; - } - if (in_tensors_[0]->shape().size() == 0 || in_tensors_[0]->ElementsNum() < 0) { - MS_LOG(ERROR) << "to_format op in tensor shape is 0, infer shape failed!"; - return RET_ERROR; - } out_tensors_[0]->set_shape(in_tensors_[0]->shape()); - infer_shape_flag_ = true; + op_parameter_->infer_flag_ = false; return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc index 5df0f21765..fee1040b4d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.cc @@ -97,7 +97,7 @@ int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) { } void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) { - printf("%-30s", name().c_str()); + printf("%-30s ", name().c_str()); if (out_tensors().empty()) { return; } @@ -134,7 +134,9 @@ void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) { auto total_num = mem_type == lite::opencl::MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num; for (int i = 0; i < print_num && i < total_num; ++i) { - if (tensor->data_type() == kNumberTypeFloat16) { + if (tensor->data_type() == kNumberTypeInt32) { + printf("%d %7d | ", i, reinterpret_cast(data.data())[i]); + } else if (tensor->data_type() == kNumberTypeFloat16) { printf("%d %7.3f | ", i, reinterpret_cast(data.data())[i]); } else { printf("%d %7.3f | ", i, reinterpret_cast(data.data())[i]); @@ -191,7 +193,7 @@ int OpenCLKernel::PostProcess() { } int OpenCLKernel::InferShape() { - if (infer_shape_flag_) { + if (op_parameter_->infer_flag_) { return RET_OK; } op_parameter_->infer_flag_ = true; @@ -202,12 +204,11 @@ int OpenCLKernel::InferShape() { op_parameter_->infer_flag_ = false; return ret; } - infer_shape_flag_ = true; return RET_OK; } int OpenCLKernel::ReSize() { - if (infer_shape_flag_) { + if (op_parameter_->infer_flag_) { return RET_OK; } auto ret = InferShape(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index c0a0320725..eaa6b9364d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -27,6 +27,7 @@ #include "src/runtime/gpu/opencl/opencl_runtime.h" #include "mindspore/lite/src/dequant.h" #include "src/runtime/kernel/opencl/utils.h" +#include "nnacl/resize_parameter.h" using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; @@ -35,15 +36,15 @@ namespace mindspore::kernel { struct OpenCLToFormatParameter { OpParameter op_parameter{}; - schema::Format src_format{schema::Format::Format_NHWC}; - schema::Format dst_format{schema::Format::Format_NHWC4}; lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG}; }; template void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) { MS_ASSERT(dst); - MS_ASSERT(src); + if (src == nullptr || src_num <= 0) { + return; + } auto *N = dst; auto *H = dst + 1; auto *W = dst + 2; @@ -70,10 +71,12 @@ void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) { template void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num, DstT default_value) { MS_ASSERT(dst); - MS_ASSERT(src); for (int i = 0; i < 4; ++i) { dst[i] = default_value; } + if (src == nullptr || src_num <= 0) { + return; + } Broadcast2GpuShape(dst, src, src_num); } @@ -92,6 +95,10 @@ struct GpuTensorInfo { H = shape.s[1]; W = shape.s[2]; C = shape.s[3]; + MS_ASSERT(N > 0); + MS_ASSERT(H > 0); + MS_ASSERT(W > 0); + MS_ASSERT(C > 0); Slice = UP_DIV(C, C4NUM); FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float); @@ -167,7 +174,6 @@ class OpenCLKernel : public LiteKernel { const std::vector &outputs, const lite::InnerContext *ctx) : LiteKernel(parameter, inputs, outputs, ctx) { ocl_runtime_ = ocl_runtime_wrap_.GetInstance(); - infer_shape_flag_ = parameter->infer_flag_; } ~OpenCLKernel() override = default; int AlignGlobalLocal(const std::vector &global, const std::vector &local); @@ -199,8 +205,6 @@ class OpenCLKernel : public LiteKernel { int DequantWeight(); void FreeDequantedWeight(); virtual int InferShape(); - bool GetInferShapeFlag() { return infer_shape_flag_; } - void SetInferShapeFlag(bool flag) { infer_shape_flag_ = flag; } protected: static std::set GenerateLocalByGlobal(size_t global_i); @@ -225,7 +229,6 @@ class OpenCLKernel : public LiteKernel { cl::Event event_; void *restore_quant_data_{nullptr}; bool dequant_flag_{false}; - bool infer_shape_flag_{false}; private: lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; @@ -241,7 +244,7 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector &input free(opParameter); return nullptr; } - if (!reinterpret_cast(kernel)->GetInferShapeFlag()) { + if (!opParameter->infer_flag_) { MS_LOG(WARNING) << "kernel don't infer shape yet!"; return kernel; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc index d8bddb295c..67c662555e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc @@ -121,8 +121,6 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector &in_tensors, for (size_t i = 0; i < in_tensors.size(); ++i) { auto *in_tensor = in_tensors.at(i); - auto dst_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC4 : schema::Format::Format_NHWC; - auto src_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC : schema::Format::Format_NHWC4; auto *new_tensor = new (std::nothrow) lite::Tensor(in_tensor->data_type(), in_tensor->shape(), in_tensor->format(), lite::Tensor::VAR); MS_ASSERT(new_tensor); @@ -130,20 +128,9 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector &in_tensors, MS_LOG(ERROR) << "OpenCLSubGraph new tensor failed!"; return RET_ERROR; } - if (mem_type == MemType::IMG) { - new_tensor->set_format(dst_format); - in_tensor->set_format(src_format); - } else { - new_tensor->set_format(src_format); - in_tensor->set_format(dst_format); - } out_tensors->emplace_back(new_tensor); KernelKey desc{kGPU, kNumberTypeFloat32, PRIM_TO_FORMAT}; - if (mem_type == MemType::IMG && ocl_runtime_->GetFp16Enable()) { - desc.data_type = kNumberTypeFloat16; - new_tensor->set_data_type(kNumberTypeFloat16); - } auto *parameter = static_cast(malloc(sizeof(OpenCLToFormatParameter))); MS_ASSERT(parameter); if (parameter == nullptr) { @@ -153,16 +140,7 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector &in_tensors, return RET_ERROR; } parameter->op_parameter.type_ = PRIM_TO_FORMAT; - bool output_shape_setted = true; - for (auto output : *out_tensors) { - if (output->shape().empty() || output->ElementsNum() < 0) { - output_shape_setted = false; - break; - } - } - parameter->op_parameter.infer_flag_ = output_shape_setted; - parameter->src_format = src_format; - parameter->dst_format = dst_format; + parameter->op_parameter.infer_flag_ = false; parameter->out_mem_type = mem_type; out_parameters->emplace_back(parameter); LiteKernel *in_convert_op = nullptr; @@ -255,8 +233,7 @@ int OpenCLSubGraph::Init() { int OpenCLSubGraph::UpdateTensorDataTypePass() { bool is_fp16 = ocl_runtime_->GetFp16Enable(); - MS_ASSERT(in_tensors_[0]); - if (is_fp16 && (in_tensors_[0]->data_type() == kNumberTypeFloat32)) { + if (is_fp16) { std::set out_set; out_set.insert(in_tensors_.begin(), in_tensors_.end()); out_set.insert(out_tensors_.begin(), out_tensors_.end()); @@ -330,16 +307,6 @@ void OpenCLSubGraph::GetInOutNodes() { } } -bool OpenCLSubGraph::IsSubGraphInferShapeDone() { - for (auto node : this->nodes_) { - auto opencl_kernel = reinterpret_cast(node); - if (!opencl_kernel->GetInferShapeFlag()) { - return false; - } - } - return true; -} - int OpenCLSubGraph::Prepare() { for (const auto tensor : in_tensors_) { MS_ASSERT(tensor); @@ -354,7 +321,6 @@ int OpenCLSubGraph::Prepare() { MS_LOG(ERROR) << "Create OpenCLExecutor fail"; return RET_ERROR; } - auto ret = RET_OK; for (auto node : this->nodes_) { if (node == nullptr) { MS_LOG(ERROR) << "node in Subgraph is nullptr"; @@ -363,26 +329,28 @@ int OpenCLSubGraph::Prepare() { auto opencl_kernel = reinterpret_cast(node); std::set pre_init_weight_list = {schema::PrimitiveType_MatMul, schema::PrimitiveType_BiasAdd}; if (pre_init_weight_list.find(opencl_kernel->Type()) != pre_init_weight_list.end()) { - ret = opencl_kernel->InitWeights(); + auto ret = opencl_kernel->InitWeights(); if (ret != RET_OK) { MS_LOG(ERROR) << "init weights " << node->name() << " failed"; return ret; } } - if (opencl_kernel->GetInferShapeFlag()) { - ret = node->Prepare(); + if (opencl_kernel->op_parameter()->infer_flag_) { + auto ret = node->Prepare(); if (ret != RET_OK) { MS_LOG(ERROR) << "prepare node " << node->name() << " failed"; return ret; } } } - auto opencl_exec = reinterpret_cast(executor_); - // If tuning_mode is DEFAULT, just malloc memory for reuse. - ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Run opencl executor failed: " << ret; - return ret; + if (all_kernels_infer_done_) { + auto opencl_exec = reinterpret_cast(executor_); + // If tuning_mode is DEFAULT, just malloc memory for reuse. + auto ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Run opencl executor failed: " << ret; + return ret; + } } return RET_OK; } @@ -423,7 +391,7 @@ int OpenCLSubGraph::ReSize(bool interrupt) { for (auto &output : outputs) { output->FreeData(); } - opencl_kernel->SetInferShapeFlag(false); + opencl_kernel->op_parameter()->infer_flag_ = false; } for (auto kernel : nodes_) { auto opencl_kernel = reinterpret_cast(kernel); diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h index 8bdb24c00c..01933cefad 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h @@ -36,6 +36,9 @@ class OpenCLSubGraph : public SubGraphKernel { subgraph_type_ = kGpuSubGraph; this->name_ = "GpuSubGraph"; nodes_set_.insert(nodes.begin(), nodes.end()); + all_kernels_infer_done_ = std::all_of(nodes_.begin(), nodes_.end(), [](const kernel::LiteKernel *kernel) { + return kernel && kernel->op_parameter() && kernel->op_parameter()->infer_flag_; + }); } ~OpenCLSubGraph() override; @@ -48,7 +51,6 @@ class OpenCLSubGraph : public SubGraphKernel { int Run() override; int Run(const KernelCallBack &before, const KernelCallBack &after) override; int InsertOpsPass(); - bool IsSubGraphInferShapeDone(); private: void UnInit(); @@ -83,6 +85,7 @@ class OpenCLSubGraph : public SubGraphKernel { std::set nodes_set_; lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr}; + bool all_kernels_infer_done_ = false; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc index a3fd9dfcdd..fdbde0be1f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc @@ -296,4 +296,27 @@ int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tens return RET_OK; } +static std::set tmp_weights; + +void StoreTmpWeight(lite::Tensor *tensor) { + MS_LOG(WARNING) << "store weight when kernel don't infer shape!"; + if (tensor && tensor->data_c() && tensor->Size()) { + void *new_data = malloc(tensor->Size()); + MS_ASSERT(new_data); + if (new_data == nullptr) { + return; + } + memcpy(new_data, tensor->data_c(), tensor->Size()); + tensor->set_data(new_data); + tmp_weights.insert(new_data); + } +} + +void FreeTmpWeight(void *data) { + if (tmp_weights.count(data)) { + free(data); + tmp_weights.erase(data); + } +} + } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h index 7e4ce5822c..6a8dcaad27 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/utils.h +++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h @@ -63,6 +63,9 @@ void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, c int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor, TypeId expect_data_type, const std::vector &expect_shape); +void StoreTmpWeight(lite::Tensor *tensor); +void FreeTmpWeight(void *tensor); + template void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane_in, int plane_out, int channel, const std::function &to_dtype) { diff --git a/mindspore/lite/src/tensor.cc b/mindspore/lite/src/tensor.cc index 523d6f696f..1488699d27 100644 --- a/mindspore/lite/src/tensor.cc +++ b/mindspore/lite/src/tensor.cc @@ -224,7 +224,7 @@ int32_t Tensor::ElementsC4Num() const { if (this->category_ == CONST_SCALAR) { return 1; } - int32_t result = 0; + int32_t result = 1; if (this->shape_.size() == 4) { result = Batch() * Height() * Width() * ((Channel() + 3) / 4 * 4); } else if (this->shape_.size() == 2) {