@@ -68,12 +68,14 @@ void *OpenCLAllocator::MinimumFit(MemType mem_type, size_t size, const ImageSize
 void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer) {
   cl_int ret = CL_SUCCESS;
   MS_ASSERT(buffer);
+  MS_ASSERT(size > 0);
   *buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), static_cast<cl_mem_flags>(flags), size, data, &ret);
   if (*buffer == nullptr) {
     MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")";
     return nullptr;
   }
   void *host_ptr = ocl_runtime_->MapBuffer(**buffer, CL_MAP_READ | CL_MAP_WRITE, size);
+  MS_ASSERT(host_ptr);
   if (host_ptr == nullptr) {
     delete *buffer;
     MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << *buffer << ", host_ptr=" << host_ptr;
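
One reading of this hunk, stated as an assumption: MS_ASSERT is an assert-style macro that is active only in debug builds, which would explain why the patch adds MS_ASSERT(host_ptr) and still keeps the runtime nullptr check right after it. A minimal sketch of that assumption (not the project's actual definition):

```cpp
#include <cassert>

// Hedged sketch: an assert-style macro that traps in debug builds and
// compiles to nothing in release builds, so the runtime
// `host_ptr == nullptr` branch above stays necessary.
#ifdef NDEBUG
#define MS_ASSERT_SKETCH(f) ((void)0)
#else
#define MS_ASSERT_SKETCH(f) assert(f)
#endif

int main() {
  MS_ASSERT_SKETCH(1 + 1 == 2);  // e.g. MS_ASSERT(size > 0) in the hunk above
  return 0;
}
```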
@@ -1,450 +1,80 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void to_format_NHWC_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  FLT4 data = (FLT4)(0.f);
-  int offset = (X * shape.z + Y) * shape.w + Z * 4;
-  __global float *src_addr = (__global float *)src_data;
-  src_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    data = TO_FLT4(((__global float4 *)src_addr)[0]);
-  } else {
-    if ((shape.w - Z * 4) >= 1) {
-      data.x = (FLT)src_addr[0];
-    }
-    if ((shape.w - Z * 4) >= 2) {
-      data.y = (FLT)src_addr[1];
-    }
-    if ((shape.w - Z * 4) >= 3) {
-      data.z = (FLT)src_addr[2];
-    }
-  }
-  if (size.y * size.z <= MAX_IMAGE2D_WIDTH)
-    WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
-  else
-    WRITE_IMAGE(dst_data, (int2)(Z, X * size.y + Y), data);
-}
-__kernel void to_format_NHWC_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                               int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  FLT4 data = (FLT4)(0.f);
-  int offset = (X * shape.z + Y) * shape.w + Z * 4;
-  __global half *src_addr = (__global half *)src_data;
-  src_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    data = TO_FLT4(((__global half4 *)src_addr)[0]);
-  } else {
-    if ((shape.w - Z * 4) >= 1) {
-      data.x = (FLT)src_addr[0];
-    }
-    if ((shape.w - Z * 4) >= 2) {
-      data.y = (FLT)src_addr[1];
-    }
-    if ((shape.w - Z * 4) >= 3) {
-      data.z = (FLT)src_addr[2];
-    }
-  }
-  if (size.y * size.z <= MAX_IMAGE2D_WIDTH)
-    WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
-  else
-    WRITE_IMAGE(dst_data, (int2)(Z, X * size.y + Y), data);
-}
-__kernel void to_format_NCHW_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  FLT4 data = (FLT4)(0.f);
-  __global float *src_addr = (__global float *)src_data;
-  __global float *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
-  __global float *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
-  __global float *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
-  if ((Z + 1) * 4 <= shape.w) {
-    data = TO_FLT4(((__global float4 *)src_addr_0)[0]);
-  } else {
-    if ((shape.w - Z * 4) >= 1) {
-      data.x = src_addr_0[0];
-    }
-    if ((shape.w - Z * 4) >= 2) {
-      data.y = src_addr_1[0];
-    }
-    if ((shape.w - Z * 4) >= 3) {
-      data.z = src_addr_2[0];
-    }
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
-}
-__kernel void to_format_NCHW_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                               int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  FLT4 data = (FLT4)(0.f);
-  __global half *src_addr = (__global half *)src_data;
-  __global half *src_addr_0 = src_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
-  __global half *src_addr_1 = src_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
-  __global half *src_addr_2 = src_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
-  if ((Z + 1) * 4 <= shape.w) {
-    data = TO_FLT4(((__global half4 *)src_addr_0)[0]);
-  } else {
-    if ((shape.w - Z * 4) >= 1) {
-      data.x = src_addr_0[0];
-    }
-    if ((shape.w - Z * 4) >= 2) {
-      data.y = src_addr_1[0];
-    }
-    if ((shape.w - Z * 4) >= 3) {
-      data.z = src_addr_2[0];
-    }
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
-}
-__kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                 int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z || shape.y == 0) {
-    return;
-  }
-  int offset = (X / shape.y) * shape.y * shape.z * shape.w + ((X % shape.y) * shape.z + Y) * shape.w + Z * 4;
-  __global float *src_addr = (__global float *)src_data;
-  src_addr += offset;
-  FLT4 data = (FLT4)(0.f);
-  if ((Z + 1) * 4 <= shape.w) {
-    data = TO_FLT4(((__global float4 *)src_addr)[0]);
-  } else {
-    if ((shape.w - Z * 4) >= 1) {
-      data.x = (FLT)src_addr[0];
-    }
-    if ((shape.w - Z * 4) >= 2) {
-      data.y = (FLT)src_addr[1];
-    }
-    if ((shape.w - Z * 4) >= 3) {
-      data.z = (FLT)src_addr[2];
-    }
-  }
-  int pos_ix = (X / shape.y) * size.z * shape.y + Z * shape.y + X % shape.y;
-  WRITE_IMAGE(dst_data, (int2)(Y, pos_ix), data);
-}
-__kernel void to_format_NHWC_to_NC4HW4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z || shape.y == 0) {
-    return;
-  }
-  int offset = (X / shape.y) * shape.y * shape.z * shape.w + ((X % shape.y) * shape.z + Y) * shape.w + Z * 4;
-  __global half *src_addr = (__global half *)src_data;
-  src_addr += offset;
-  FLT4 data = (FLT4)(0.f);
-  if ((Z + 1) * 4 <= shape.w) {
-    data = TO_FLT4(((__global half4 *)src_addr)[0]);
-  } else {
-    if ((shape.w - Z * 4) >= 1) {
-      data.x = (FLT)src_addr[0];
-    }
-    if ((shape.w - Z * 4) >= 2) {
-      data.y = (FLT)src_addr[1];
-    }
-    if ((shape.w - Z * 4) >= 3) {
-      data.z = (FLT)src_addr[2];
-    }
-  }
-  int pos_ix = (X / shape.y) * size.z * shape.y + Z * shape.y + X % shape.y;
-  WRITE_IMAGE(dst_data, (int2)(Y, pos_ix), data);
-}
-__kernel void to_format_NHWC4_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                 int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), TO_FLT4(src_data[(X * size.y + Y) * size.z + Z]));
-}
-__kernel void to_format_NHWC4_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), TO_FLT4(src_data[(X * size.y + Y) * size.z + Z]));
-}
-__kernel void to_format_NC4HW4_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data,
-                                                   int4 size, int4 shape) {
-  // size(h, w, c4, 1), shape(n, c, h, w)
-  int X = get_global_id(0);  // h
-  int Y = get_global_id(1);  // w
-  int Z = get_global_id(2);  // c4
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), TO_FLT4(src_data[(Z * size.x + X) * size.y + Y]));
-}
-__kernel void to_format_NC4HW4_to_NC4HW4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
-                                                  int4 shape) {
-  // size(h, w, c4, 1), shape(n, c, h, w)
-  int X = get_global_id(0);  // h
-  int Y = get_global_id(1);  // w
-  int Z = get_global_id(2);  // c4
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), TO_FLT4(src_data[(Z * size.x + X) * size.y + Y]));
-}
-__kernel void to_format_NCHW_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
-                                               int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  dst_data[(Z * size.y + Y) * size.x + X] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z)));
-}
-__kernel void to_format_NCHW_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
-                                              int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  dst_data[(Z * size.y + Y) * size.x + X] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z)));
-}
-__kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  float4 data;
-  if (size.y * size.z <= MAX_IMAGE2D_WIDTH)
-    data = convert_float4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-  else
-    data = convert_float4(READ_IMAGEIN(src_data, smp_zero, (int2)(Z, X * size.y + Y)));
-  int offset = (X * shape.z + Y) * shape.w + Z * 4;
-  __global float *dst_addr = (__global float *)dst_data;
-  dst_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    ((__global float4 *)dst_addr)[0] = data;
-  } else {
-    if (shape.w - Z * 4 >= 1) {
-      dst_addr[0] = data.x;
-    }
-    if (shape.w - Z * 4 >= 2) {
-      dst_addr[1] = data.y;
-    }
-    if (shape.w - Z * 4 >= 3) {
-      dst_addr[2] = data.z;
-    }
-  }
-}
-__kernel void to_format_NHWC4_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-  int offset = (X * shape.z + Y) * shape.w + Z * 4;
-  __global float *dst_addr = (__global float *)dst_data;
-  __global float *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
-  __global float *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
-  __global float *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
-  dst_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    ((__global float4 *)dst_addr_0)[0] = data;
-  } else {
-    if (shape.w - Z * 4 >= 1) {
-      dst_addr_0[0] = data.x;
-    }
-    if (shape.w - Z * 4 >= 2) {
-      dst_addr_1[0] = data.y;
-    }
-    if (shape.w - Z * 4 >= 3) {
-      dst_addr_2[0] = data.z;
-    }
-  }
-}
-__kernel void to_format_NHWC4_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
-                                               int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-  int offset = (X * shape.z + Y) * shape.w + Z * 4;
-  __global half *dst_addr = (__global half *)dst_data;
-  __global half *dst_addr_0 = dst_addr + ((Z * 4 + 0) * shape.y + X) * shape.z + Y;
-  __global half *dst_addr_1 = dst_addr + ((Z * 4 + 1) * shape.y + X) * shape.z + Y;
-  __global half *dst_addr_2 = dst_addr + ((Z * 4 + 2) * shape.y + X) * shape.z + Y;
-  dst_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    ((__global half4 *)dst_addr_0)[0] = data;
-  } else {
-    if (shape.w - Z * 4 >= 1) {
-      dst_addr_0[0] = data.x;
-    }
-    if (shape.w - Z * 4 >= 2) {
-      dst_addr_1[0] = data.y;
-    }
-    if (shape.w - Z * 4 >= 3) {
-      dst_addr_2[0] = data.z;
-    }
-  }
-}
-__kernel void to_format_NHWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
-                                               int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  half4 data;
-  if (size.y * size.z <= MAX_IMAGE2D_WIDTH)
-    data = convert_half4(READ_IMAGEIN(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-  else
-    data = convert_half4(READ_IMAGEIN(src_data, smp_zero, (int2)(Z, X * size.y + Y)));
-  int offset = (X * shape.z + Y) * shape.w + Z * 4;
-  __global half *dst_addr = (__global half *)dst_data;
-  dst_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    ((__global half4 *)dst_addr)[0] = data;
-  } else {
-    if (shape.w - Z * 4 >= 1) {
-      dst_addr[0] = data.x;
-    }
-    if (shape.w - Z * 4 >= 2) {
-      dst_addr[1] = data.y;
-    }
-    if (shape.w - Z * 4 >= 3) {
-      dst_addr[2] = data.z;
-    }
-  }
-}
-__kernel void to_format_NC4HW4_to_NHWC_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
-                                                 int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z || shape.y == 0) {
-    return;
-  }
-  int pos_ix = (X / shape.y) * size.z * shape.y + Z * shape.y + X % shape.y;
-  float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y, pos_ix)));
-  int offset = (X / shape.y) * shape.y * shape.z * shape.w + ((X % shape.y) * shape.z + Y) * shape.w + Z * 4;
-  __global float *dst_addr = (__global float *)dst_data;
-  dst_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    ((__global float4 *)dst_addr)[0] = data;
-  } else {
-    if (shape.w - Z * 4 >= 1) {
-      dst_addr[0] = data.x;
-    }
-    if (shape.w - Z * 4 >= 2) {
-      dst_addr[1] = data.y;
-    }
-    if (shape.w - Z * 4 >= 3) {
-      dst_addr[2] = data.z;
-    }
-  }
-}
-__kernel void to_format_NC4HW4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z || shape.y == 0) {
-    return;
-  }
-  int pos_ix = (X / shape.y) * size.z * shape.y + Z * shape.y + X % shape.y;
-  half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y, pos_ix)));
-  int offset = (X / shape.y) * shape.y * shape.z * shape.w + ((X % shape.y) * shape.z + Y) * shape.w + Z * 4;
-  __global half *dst_addr = (__global half *)dst_data;
-  dst_addr += offset;
-  if ((Z + 1) * 4 <= shape.w) {
-    ((__global half4 *)dst_addr)[0] = data;
-  } else {
-    if (shape.w - Z * 4 >= 1) {
-      dst_addr[0] = data.x;
-    }
-    if (shape.w - Z * 4 >= 2) {
-      dst_addr[1] = data.y;
-    }
-    if (shape.w - Z * 4 >= 3) {
-      dst_addr[2] = data.z;
-    }
-  }
-}
-__kernel void to_format_NC4HW4_to_NC4HW4_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
-                                                   int4 shape) {
-  // size(h, w, c, 1), shape(n, c, h, w)
-  int X = get_global_id(0);  // h
-  int Y = get_global_id(1);  // w
-  int Z = get_global_id(2);  // c
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  dst_data[(Z * size.x + X) * size.y + Y] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X)));
-}
-__kernel void to_format_NC4HW4_to_NC4HW4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
-                                                  int4 shape) {
-  // size(h, w, c, 1), shape(n, c, h, w)
-  int X = get_global_id(0);  // h
-  int Y = get_global_id(1);  // w
-  int Z = get_global_id(2);  // c
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  dst_data[(Z * size.x + X) * size.y + Y] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X)));
-}
-__kernel void to_format_NHWC4_to_NHWC4_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
-                                                 int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  dst_data[(X * size.y + Y) * size.z + Z] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
-__kernel void to_format_NHWC4_to_NHWC4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
-                                                int4 shape) {
-  int X = get_global_id(0);
-  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= size.x || Y >= size.y || Z >= size.z) {
-    return;
-  }
-  dst_data[(X * size.y + Y) * size.z + Z] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
-}
+
+#define BUF_to_IMG(src_dtype, dst_dtype, SRC_FLT, DST_FLT, WRITE_IMAGE_OUT) \
+  __kernel void BUF_to_IMG_##src_dtype##_##dst_dtype(__global float4 *src_data, __write_only image2d_t dst_data, \
+                                                     int4 size, int4 shape) { \
+    int X = get_global_id(0); \
+    int Y = get_global_id(1); \
+    int Z = get_global_id(2); \
+    if (X >= size.x || Y >= size.y || Z >= size.z) { \
+      return; \
+    } \
+    DST_FLT##4 data = (DST_FLT##4)(0.f); \
+    int offset = (X * shape.z + Y) * shape.w + Z * 4; \
+    __global SRC_FLT *src_addr = (__global SRC_FLT *)src_data; \
+    src_addr += offset; \
+    if ((Z + 1) * 4 <= shape.w) { \
+      data = convert_##DST_FLT##4(((__global SRC_FLT##4 *)src_addr)[0]); \
+    } else { \
+      if ((shape.w - Z * 4) >= 1) { \
+        data.x = (DST_FLT)src_addr[0]; \
+      } \
+      if ((shape.w - Z * 4) >= 2) { \
+        data.y = (DST_FLT)src_addr[1]; \
+      } \
+      if ((shape.w - Z * 4) >= 3) { \
+        data.z = (DST_FLT)src_addr[2]; \
+      } \
+    } \
+    if (size.y * size.z <= MAX_IMAGE2D_WIDTH) \
+      WRITE_IMAGE_OUT(dst_data, (int2)(Y * size.z + Z, X), data); \
+    else \
+      WRITE_IMAGE_OUT(dst_data, (int2)(Z, X * size.y + Y), data); \
+  }
+
+// BUF_to_IMG(src_dtype, dst_dtype, SRC_FLT, DST_FLT, WRITE_IMAGE_OUT)
+BUF_to_IMG(float32, float32, float, float, write_imagef);
+BUF_to_IMG(float32, float16, float, half, write_imageh);
+BUF_to_IMG(float16, float16, half, half, write_imageh);
+BUF_to_IMG(int32, int32, float, float, write_imagef);
+BUF_to_IMG(uint32, uint32, float, float, write_imagef);
+
+#define IMG_to_BUF(src_dtype, dst_dtype, SRC_FLT, DST_FLT, READ_IMAGE_IN) \
+  __kernel void IMG_to_BUF_##src_dtype##_##dst_dtype(__read_only image2d_t src_data, __global DST_FLT##4 * dst_data, \
+                                                     int4 size, int4 shape) { \
+    int X = get_global_id(0); \
+    int Y = get_global_id(1); \
+    int Z = get_global_id(2); \
+    if (X >= size.x || Y >= size.y || Z >= size.z) { \
+      return; \
+    } \
+    DST_FLT##4 data; \
+    if (size.y * size.z <= MAX_IMAGE2D_WIDTH) \
+      data = convert_##DST_FLT##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Y * size.z + Z, X))); \
+    else \
+      data = convert_##DST_FLT##4(READ_IMAGE_IN(src_data, smp_zero, (int2)(Z, X * size.y + Y))); \
+    int offset = (X * shape.z + Y) * shape.w + Z * 4; \
+    __global DST_FLT *dst_addr = (__global DST_FLT *)dst_data; \
+    dst_addr += offset; \
+    if ((Z + 1) * 4 <= shape.w) { \
+      ((__global DST_FLT##4 *)dst_addr)[0] = data; \
+    } else { \
+      if (shape.w - Z * 4 >= 1) { \
+        dst_addr[0] = data.x; \
+      } \
+      if (shape.w - Z * 4 >= 2) { \
+        dst_addr[1] = data.y; \
+      } \
+      if (shape.w - Z * 4 >= 3) { \
+        dst_addr[2] = data.z; \
+      } \
+    } \
+  }
+
+// IMG_to_BUF(src_dtype, dst_dtype, SRC_FLT, DST_FLT, READ_IMAGE_IN)
+IMG_to_BUF(float32, float32, float, float, read_imagef);
+IMG_to_BUF(float16, float32, half, float, read_imageh);
+IMG_to_BUF(float16, float16, half, half, read_imageh);
+IMG_to_BUF(int32, int32, float, float, read_imagef);
+IMG_to_BUF(uint32, uint32, float, float, read_imagef);
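
For orientation: the roughly 450 lines of hand-written to_format kernels deleted above collapse into these two macros, and each instantiation stamps out one kernel whose name encodes the source and destination dtypes (the host side assembles the same name in ToFormatOpenCLKernel::Prepare, later in this patch). As an illustration only, a mechanical expansion of `IMG_to_BUF(float16, float32, half, float, read_imageh)` looks roughly like this in the file's own OpenCL C:

```c
// Expansion sketch of IMG_to_BUF(float16, float32, half, float, read_imageh):
__kernel void IMG_to_BUF_float16_float32(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
                                         int4 shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= size.x || Y >= size.y || Z >= size.z) {
    return;
  }
  float4 data;
  if (size.y * size.z <= MAX_IMAGE2D_WIDTH)
    data = convert_float4(read_imageh(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
  else
    data = convert_float4(read_imageh(src_data, smp_zero, (int2)(Z, X * size.y + Y)));
  int offset = (X * shape.z + Y) * shape.w + Z * 4;
  __global float *dst_addr = (__global float *)dst_data;
  dst_addr += offset;
  if ((Z + 1) * 4 <= shape.w) {
    ((__global float4 *)dst_addr)[0] = data;
  } else {
    // trailing channels when C is not a multiple of 4
    if (shape.w - Z * 4 >= 1) dst_addr[0] = data.x;
    if (shape.w - Z * 4 >= 2) dst_addr[1] = data.y;
    if (shape.w - Z * 4 >= 3) dst_addr[2] = data.z;
  }
}
```

This reproduces the behavior of the old to_format_NHWC4_to_NHWC_BUF_* kernels, including the wide-tensor layout fallback keyed on MAX_IMAGE2D_WIDTH, while baking the read function into the name instead of passing it via the removed -DREAD_IMAGEIN build option.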
@@ -44,6 +44,19 @@ using mindspore::schema::PrimitiveType_Eltwise;
 namespace mindspore::kernel {

 int ArithmeticOpenCLKernel::CheckSpecs() {
+  for (auto &tensor : in_tensors_) {
+    if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
+      MS_LOG(ERROR) << "ArithmeticOpenCLKernel only support fp32/fp16 input";
+      return RET_ERROR;
+    }
+  }
+  for (auto &tensor : out_tensors_) {
+    if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
+      MS_LOG(ERROR) << "ArithmeticOpenCLKernel only support fp32/fp16 output";
+      return RET_ERROR;
+    }
+  }
   if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
     MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
     return RET_ERROR;
@@ -286,6 +286,7 @@ void Conv2DOpenCLKernel::InitFilter() {
   }
   FreeDequantedWeight();
+  FreeTmpWeight(in_tensors_.at(kWeightIndex)->data_c());
 }

 void Conv2DOpenCLKernel::InitBias() {

@@ -322,6 +323,7 @@ void Conv2DOpenCLKernel::InitBias() {
     }
   }
   allocator->UnmapBuffer(packed_bias_);
+  FreeTmpWeight(in_tensors_.at(kBiasIndex)->data_c());
 }

 void Conv2DOpenCLKernel::SetConstArgs() {

@@ -480,11 +482,9 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector<lite::Tensor *> &input
   MS_ASSERT(!inputs.empty());
   MS_ASSERT(!outputs.empty());
   MS_ASSERT(opParameter);
-  MS_ASSERT(inputs.front()->shape().size() == 4);
-  MS_ASSERT(outputs.front()->shape().size() == 4);
   auto *conv_param = reinterpret_cast<ConvParameter *>(opParameter);
-  int input_channel = inputs.front()->shape().at(3);
-  int output_channel = outputs.front()->shape().at(3);
+  int input_channel = conv_param->input_channel_;
+  int output_channel = conv_param->output_channel_;
   int group = conv_param->group_;

   // case 1: depthwise conv2d

@@ -529,6 +529,10 @@ kernel::LiteKernel *OpenCLConv2DCreator(const std::vector<lite::Tensor *> &input
     }
   }
   if (!infer_shape_done) {
+    StoreTmpWeight(inputs.at(kWeightIndex));
+    if (inputs.size() > kBiasIndex) {
+      StoreTmpWeight(inputs.at(kBiasIndex));
+    }
     MS_LOG(WARNING) << "kernel don't infer shape yet!";
     return kernel;
   }
@@ -97,10 +97,11 @@ int ReshapeOpenCLKernel::Run() {
 }

 int ReshapeOpenCLKernel::PreProcess() {
-  if (Type() == PrimitiveType_Reshape && !infer_shape_flag_) {
+  if (Type() == PrimitiveType_Reshape && !op_parameter_->infer_flag_) {
     auto shape_tensor = in_tensors_[1];
     if (!shape_tensor->IsConst()) {
       ocl_runtime_->SyncCommandQueue();
+      shape_tensor->MutableData();
     }
   }
   return OpenCLKernel::PreProcess();
@@ -33,7 +33,7 @@ using mindspore::schema::PrimitiveType_Resize;
 namespace mindspore::kernel {

 int ResizeOpenCLKernel::CheckSpecs() {
-  if (in_tensors_.size() != 1 || out_tensors_.size() != 1) {
+  if (in_tensors_.size() != 2 || out_tensors_.size() != 1) {
     MS_LOG(ERROR) << "in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
     return RET_ERROR;
   }

@@ -119,6 +119,17 @@ int ResizeOpenCLKernel::Run() {
   return RET_OK;
 }

+int ResizeOpenCLKernel::PreProcess() {
+  if (Type() == PrimitiveType_Resize && !op_parameter_->infer_flag_) {
+    auto shape_tensor = in_tensors_[1];
+    if (!shape_tensor->IsConst()) {
+      ocl_runtime_->SyncCommandQueue();
+      shape_tensor->MutableData();
+    }
+  }
+  return OpenCLKernel::PreProcess();
+}
+
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Resize, OpenCLKernelCreator<ResizeOpenCLKernel>)
 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Resize, OpenCLKernelCreator<ResizeOpenCLKernel>)
 }  // namespace mindspore::kernel
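
Reshape and Resize now share the same PreProcess pattern for dynamic shapes: when inference has not run yet and the shape operand is a non-const tensor produced on the GPU, the kernel first calls SyncCommandQueue() so that all queued device work (including whatever wrote the shape tensor) has completed, then touches MutableData() to make the shape bytes available on the host before OpenCLKernel::PreProcess() re-runs shape inference. The Reshape hunk adds the previously missing MutableData() call; Resize gains the whole override, declared in the header hunk that follows.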
@@ -34,6 +34,7 @@ class ResizeOpenCLKernel : public OpenCLKernel {
   int CheckSpecs() override;
   void SetConstArgs() override;
   void SetGlobalLocal() override;
+  int PreProcess() override;

  private:
   float getResizeScaleFactor(int input_size, int output_size);
@@ -71,8 +71,20 @@ int StackOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << " only support input size = 2 and output size = 1";
     return RET_ERROR;
   }
+  for (auto &tensor : in_tensors_) {
+    if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
+      MS_LOG(ERROR) << " only support fp32/fp16 input";
+      return RET_ERROR;
+    }
+  }
+  for (auto &tensor : out_tensors_) {
+    if (tensor->data_type() != kNumberTypeFloat32 && tensor->data_type() != kNumberTypeFloat16) {
+      MS_LOG(ERROR) << " only support fp32/fp16 output";
+      return RET_ERROR;
+    }
+  }
   if (in_tensors_[0]->shape().size() > 4 || in_tensors_[0]->shape().size() <= 0) {
-    MS_LOG(ERROR) << " only support dim <= 4 ";
+    MS_LOG(ERROR) << " only support 0<dim<=4";
     return RET_ERROR;
   }
   axis_ = axis_ < 0 ? axis_ + in_tensors_[0]->shape().size() : axis_;
@@ -64,32 +64,22 @@ void ToFormatOpenCLKernel::SetGlobalLocal() {
 }

 int ToFormatOpenCLKernel::Prepare() {
-  std::map<TypeId, std::string> dtype_str{
-    {kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}, {kNumberTypeInt32, "float"}};
-  std::string kernel_name;
-  if (out_mem_type_ == MemType::IMG) {
-    kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_.front()->data_type()];
-  } else {
-    kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_.front()->data_type()];
-  }
+  static std::map<TypeId, std::string> dtype_str{{kNumberTypeFloat32, "float32"},
+                                                 {kNumberTypeFloat16, "float16"},
+                                                 {kNumberTypeInt32, "int32"},
+                                                 {kNumberTypeUInt32, "uint32"}};
+  auto in_tensor = in_tensors_.front();
+  auto out_tensor = out_tensors_.front();
+  std::string kernel_name = out_mem_type_ == MemType::IMG ? "BUF_to_IMG_" : "IMG_to_BUF_";
+  kernel_name += dtype_str[in_tensor->data_type()] + "_" + dtype_str[out_tensor->data_type()];
   this->set_name(kernel_name);
-#ifdef PROGRAM_WITH_IL
-  kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
-#else
   std::string program_name = "to_format";
   std::string source = to_format_source;
   ocl_runtime_->LoadSource(program_name, source);
-  std::vector<std::string> ext_build_opt;
-  if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
-    ext_build_opt.push_back("-DREAD_IMAGEIN=read_imagef");
-  } else {
-    ext_build_opt.push_back("-DREAD_IMAGEIN=read_imageh");
-  }
-  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, ext_build_opt);
-#endif
+  ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);

-  auto output = GpuTensorInfo(out_tensors_.front());
+  auto output = GpuTensorInfo(out_tensor);
   N_ = output.N;
   H_ = output.H;
   W_ = output.W;
@@ -112,15 +102,8 @@ int ToFormatOpenCLKernel::Run() {
 }

 int ToFormatOpenCLKernel::InferShape() {
-  if (infer_shape_flag_) {
-    return RET_OK;
-  }
-  if (in_tensors_[0]->shape().size() == 0 || in_tensors_[0]->ElementsNum() < 0) {
-    MS_LOG(ERROR) << "to_format op in tensor shape is 0, infer shape failed!";
-    return RET_ERROR;
-  }
   out_tensors_[0]->set_shape(in_tensors_[0]->shape());
-  infer_shape_flag_ = true;
+  op_parameter_->infer_flag_ = false;
   return RET_OK;
 }
@@ -97,7 +97,7 @@ int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) {
 }

 void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
-  printf("%-30s", name().c_str());
+  printf("%-30s ", name().c_str());
   if (out_tensors().empty()) {
     return;
   }

@@ -134,7 +134,9 @@ void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
   auto total_num = mem_type == lite::opencl::MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num;
   for (int i = 0; i < print_num && i < total_num; ++i) {
-    if (tensor->data_type() == kNumberTypeFloat16) {
+    if (tensor->data_type() == kNumberTypeInt32) {
+      printf("%d %7d | ", i, reinterpret_cast<int32_t *>(data.data())[i]);
+    } else if (tensor->data_type() == kNumberTypeFloat16) {
       printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]);
     } else {
       printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);

@@ -191,7 +193,7 @@ int OpenCLKernel::PostProcess() {
 }

 int OpenCLKernel::InferShape() {
-  if (infer_shape_flag_) {
+  if (op_parameter_->infer_flag_) {
     return RET_OK;
   }
   op_parameter_->infer_flag_ = true;

@@ -202,12 +204,11 @@ int OpenCLKernel::InferShape() {
     op_parameter_->infer_flag_ = false;
     return ret;
   }
-  infer_shape_flag_ = true;
   return RET_OK;
 }

 int OpenCLKernel::ReSize() {
-  if (infer_shape_flag_) {
+  if (op_parameter_->infer_flag_) {
     return RET_OK;
   }
   auto ret = InferShape();
@@ -27,6 +27,7 @@
 #include "src/runtime/gpu/opencl/opencl_runtime.h"
 #include "mindspore/lite/src/dequant.h"
 #include "src/runtime/kernel/opencl/utils.h"
+#include "nnacl/resize_parameter.h"

 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;

@@ -35,15 +36,15 @@ namespace mindspore::kernel {
 struct OpenCLToFormatParameter {
   OpParameter op_parameter{};
-  schema::Format src_format{schema::Format::Format_NHWC};
-  schema::Format dst_format{schema::Format::Format_NHWC4};
   lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG};
 };

 template <typename SrcT, typename DstT>
 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
   MS_ASSERT(dst);
-  MS_ASSERT(src);
+  if (src == nullptr || src_num <= 0) {
+    return;
+  }
   auto *N = dst;
   auto *H = dst + 1;
   auto *W = dst + 2;

@@ -70,10 +71,12 @@ void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
 template <typename SrcT, typename DstT>
 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num, DstT default_value) {
   MS_ASSERT(dst);
-  MS_ASSERT(src);
   for (int i = 0; i < 4; ++i) {
     dst[i] = default_value;
   }
+  if (src == nullptr || src_num <= 0) {
+    return;
+  }
   Broadcast2GpuShape(dst, src, src_num);
 }
@@ -92,6 +95,10 @@ struct GpuTensorInfo {
     H = shape.s[1];
     W = shape.s[2];
     C = shape.s[3];
+    MS_ASSERT(N > 0);
+    MS_ASSERT(H > 0);
+    MS_ASSERT(W > 0);
+    MS_ASSERT(C > 0);
     Slice = UP_DIV(C, C4NUM);

     FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
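
The new assertions guard the Slice computation that follows them; Slice counts 4-channel groups. Assuming UP_DIV is the usual round-up division macro from nnacl (UP_DIV(x, y) == (x + y - 1) / y), a few worked values:

```cpp
#include <cassert>

// Round-up division, matching the assumed nnacl UP_DIV semantics.
constexpr int UpDiv(int x, int y) { return (x + y - 1) / y; }
constexpr int C4NUM = 4;

int main() {
  assert(UpDiv(7, C4NUM) == 2);  // C = 7 -> 2 slices, channels padded to 8
  assert(UpDiv(8, C4NUM) == 2);  // C = 8 -> exactly 2 slices, no padding
  assert(UpDiv(9, C4NUM) == 3);  // C = 9 -> 3 slices
  return 0;
}
```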
@@ -167,7 +174,6 @@ class OpenCLKernel : public LiteKernel {
             const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
       : LiteKernel(parameter, inputs, outputs, ctx) {
     ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
-    infer_shape_flag_ = parameter->infer_flag_;
   }
   ~OpenCLKernel() override = default;
   int AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local);

@@ -199,8 +205,6 @@ class OpenCLKernel : public LiteKernel {
   int DequantWeight();
   void FreeDequantedWeight();
   virtual int InferShape();
-  bool GetInferShapeFlag() { return infer_shape_flag_; }
-  void SetInferShapeFlag(bool flag) { infer_shape_flag_ = flag; }

  protected:
   static std::set<size_t> GenerateLocalByGlobal(size_t global_i);

@@ -225,7 +229,6 @@ class OpenCLKernel : public LiteKernel {
   cl::Event event_;
   void *restore_quant_data_{nullptr};
   bool dequant_flag_{false};
-  bool infer_shape_flag_{false};

  private:
   lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;

@@ -241,7 +244,7 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &input
     free(opParameter);
     return nullptr;
   }
-  if (!reinterpret_cast<kernel::OpenCLKernel *>(kernel)->GetInferShapeFlag()) {
+  if (!opParameter->infer_flag_) {
     MS_LOG(WARNING) << "kernel don't infer shape yet!";
     return kernel;
   }
@@ -121,8 +121,6 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
   for (size_t i = 0; i < in_tensors.size(); ++i) {
     auto *in_tensor = in_tensors.at(i);
-    auto dst_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC4 : schema::Format::Format_NHWC;
-    auto src_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC : schema::Format::Format_NHWC4;
     auto *new_tensor = new (std::nothrow)
       lite::Tensor(in_tensor->data_type(), in_tensor->shape(), in_tensor->format(), lite::Tensor::VAR);
     MS_ASSERT(new_tensor);

@@ -130,20 +128,9 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
       MS_LOG(ERROR) << "OpenCLSubGraph new tensor failed!";
       return RET_ERROR;
     }
-    if (mem_type == MemType::IMG) {
-      new_tensor->set_format(dst_format);
-      in_tensor->set_format(src_format);
-    } else {
-      new_tensor->set_format(src_format);
-      in_tensor->set_format(dst_format);
-    }
     out_tensors->emplace_back(new_tensor);
     KernelKey desc{kGPU, kNumberTypeFloat32, PRIM_TO_FORMAT};
-    if (mem_type == MemType::IMG && ocl_runtime_->GetFp16Enable()) {
-      desc.data_type = kNumberTypeFloat16;
-      new_tensor->set_data_type(kNumberTypeFloat16);
-    }
     auto *parameter = static_cast<OpenCLToFormatParameter *>(malloc(sizeof(OpenCLToFormatParameter)));
     MS_ASSERT(parameter);
     if (parameter == nullptr) {

@@ -153,16 +140,7 @@ int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
       return RET_ERROR;
     }
     parameter->op_parameter.type_ = PRIM_TO_FORMAT;
-    bool output_shape_setted = true;
-    for (auto output : *out_tensors) {
-      if (output->shape().empty() || output->ElementsNum() < 0) {
-        output_shape_setted = false;
-        break;
-      }
-    }
-    parameter->op_parameter.infer_flag_ = output_shape_setted;
-    parameter->src_format = src_format;
-    parameter->dst_format = dst_format;
+    parameter->op_parameter.infer_flag_ = false;
     parameter->out_mem_type = mem_type;
     out_parameters->emplace_back(parameter);
     LiteKernel *in_convert_op = nullptr;
@@ -255,8 +233,7 @@ int OpenCLSubGraph::Init() {
 int OpenCLSubGraph::UpdateTensorDataTypePass() {
   bool is_fp16 = ocl_runtime_->GetFp16Enable();
-  MS_ASSERT(in_tensors_[0]);
-  if (is_fp16 && (in_tensors_[0]->data_type() == kNumberTypeFloat32)) {
+  if (is_fp16) {
     std::set<lite::Tensor *> out_set;
     out_set.insert(in_tensors_.begin(), in_tensors_.end());
     out_set.insert(out_tensors_.begin(), out_tensors_.end());
@@ -330,16 +307,6 @@ void OpenCLSubGraph::GetInOutNodes() {
   }
 }

-bool OpenCLSubGraph::IsSubGraphInferShapeDone() {
-  for (auto node : this->nodes_) {
-    auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(node);
-    if (!opencl_kernel->GetInferShapeFlag()) {
-      return false;
-    }
-  }
-  return true;
-}
-
 int OpenCLSubGraph::Prepare() {
   for (const auto tensor : in_tensors_) {
     MS_ASSERT(tensor);
@@ -354,7 +321,6 @@ int OpenCLSubGraph::Prepare() {
     MS_LOG(ERROR) << "Create OpenCLExecutor fail";
     return RET_ERROR;
   }
-  auto ret = RET_OK;
   for (auto node : this->nodes_) {
     if (node == nullptr) {
       MS_LOG(ERROR) << "node in Subgraph is nullptr";

@@ -363,26 +329,28 @@ int OpenCLSubGraph::Prepare() {
     auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(node);
     std::set<int> pre_init_weight_list = {schema::PrimitiveType_MatMul, schema::PrimitiveType_BiasAdd};
     if (pre_init_weight_list.find(opencl_kernel->Type()) != pre_init_weight_list.end()) {
-      ret = opencl_kernel->InitWeights();
+      auto ret = opencl_kernel->InitWeights();
       if (ret != RET_OK) {
        MS_LOG(ERROR) << "init weights " << node->name() << " failed";
         return ret;
       }
     }
-    if (opencl_kernel->GetInferShapeFlag()) {
-      ret = node->Prepare();
+    if (opencl_kernel->op_parameter()->infer_flag_) {
+      auto ret = node->Prepare();
       if (ret != RET_OK) {
         MS_LOG(ERROR) << "prepare node " << node->name() << " failed";
         return ret;
       }
     }
   }
-  auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
-  // If tuning_mode is DEFAULT, just malloc memory for reuse.
-  ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
-    return ret;
+  if (all_kernels_infer_done_) {
+    auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
+    // If tuning_mode is DEFAULT, just malloc memory for reuse.
+    auto ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
+      return ret;
+    }
   }
   return RET_OK;
 }
@@ -423,7 +391,7 @@ int OpenCLSubGraph::ReSize(bool interrupt) {
     for (auto &output : outputs) {
       output->FreeData();
     }
-    opencl_kernel->SetInferShapeFlag(false);
+    opencl_kernel->op_parameter()->infer_flag_ = false;
   }
   for (auto kernel : nodes_) {
     auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel);
@@ -36,6 +36,9 @@ class OpenCLSubGraph : public SubGraphKernel {
     subgraph_type_ = kGpuSubGraph;
     this->name_ = "GpuSubGraph";
     nodes_set_.insert(nodes.begin(), nodes.end());
+    all_kernels_infer_done_ = std::all_of(nodes_.begin(), nodes_.end(), [](const kernel::LiteKernel *kernel) {
+      return kernel && kernel->op_parameter() && kernel->op_parameter()->infer_flag_;
+    });
   }
   ~OpenCLSubGraph() override;
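
This cached flag replaces the per-call OpenCLSubGraph::IsSubGraphInferShapeDone() walk removed earlier in this patch: whether every node's OpParameter reports infer_flag_ is now decided once, at subgraph construction, and Prepare() consults all_kernels_infer_done_ before letting RunOrTune pre-allocate memory.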
@@ -48,7 +51,6 @@ class OpenCLSubGraph : public SubGraphKernel {
   int Run() override;
   int Run(const KernelCallBack &before, const KernelCallBack &after) override;
   int InsertOpsPass();
-  bool IsSubGraphInferShapeDone();

  private:
   void UnInit();

@@ -83,6 +85,7 @@ class OpenCLSubGraph : public SubGraphKernel {
   std::set<LiteKernel *> nodes_set_;
   lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
   lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
+  bool all_kernels_infer_done_ = false;
 };
 }  // namespace mindspore::kernel
@@ -296,4 +296,27 @@ int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tens
   return RET_OK;
 }

+static std::set<void *> tmp_weights;
+
+void StoreTmpWeight(lite::Tensor *tensor) {
+  MS_LOG(WARNING) << "store weight when kernel don't infer shape!";
+  if (tensor && tensor->data_c() && tensor->Size()) {
+    void *new_data = malloc(tensor->Size());
+    MS_ASSERT(new_data);
+    if (new_data == nullptr) {
+      return;
+    }
+    memcpy(new_data, tensor->data_c(), tensor->Size());
+    tensor->set_data(new_data);
+    tmp_weights.insert(new_data);
+  }
+}
+
+void FreeTmpWeight(void *data) {
+  if (tmp_weights.count(data)) {
+    free(data);
+    tmp_weights.erase(data);
+  }
+}
+
 }  // namespace mindspore::kernel
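
Reading these helpers together with the Conv2D hunks above: when OpenCLConv2DCreator runs before shape inference has succeeded, StoreTmpWeight deep-copies the weight/bias payload, re-points the tensor at the copy, and records the pointer in tmp_weights; once InitFilter()/InitBias() have produced the packed GPU copies, they call FreeTmpWeight() to release the duplicate. The set-membership test makes the free a no-op for pointers that were never stored, so call sites do not need to track whether a copy was made.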
@@ -63,6 +63,9 @@ void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, c
 int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
                          TypeId expect_data_type, const std::vector<int> &expect_shape);

+void StoreTmpWeight(lite::Tensor *tensor);
+void FreeTmpWeight(void *tensor);
+
 template <class T1, class T2>
 void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane_in, int plane_out, int channel,
                       const std::function<T2(T1)> &to_dtype) {
@@ -224,7 +224,7 @@ int32_t Tensor::ElementsC4Num() const {
   if (this->category_ == CONST_SCALAR) {
     return 1;
   }
-  int32_t result = 0;
+  int32_t result = 1;
   if (this->shape_.size() == 4) {
     result = Batch() * Height() * Width() * ((Channel() + 3) / 4 * 4);
   } else if (this->shape_.size() == 2) {
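
The initializer fix (0 to 1) presumably matters for ranks outside the branches visible in this hunk, where a product accumulated into a zero-initialized result would always come out 0; that fallback branch itself is not shown, so this is an inference from the diff. For the 4-D branch, the channel count is rounded up to a multiple of 4 before multiplying:

```cpp
#include <cassert>
#include <cstdint>

// Sketch of the 4-D branch of Tensor::ElementsC4Num shown above:
// channels pad up to the next multiple of 4.
int32_t ElementsC4Num4D(int32_t n, int32_t h, int32_t w, int32_t c) {
  return n * h * w * ((c + 3) / 4 * 4);
}

int main() {
  assert(ElementsC4Num4D(1, 5, 5, 7) == 200);  // 7 channels pad to 8: 1*5*5*8
  assert(ElementsC4Num4D(2, 3, 3, 4) == 72);   // already aligned: 2*3*3*4
  return 0;
}
```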