| @@ -1,4 +1,7 @@ | |||
| __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape, | |||
| #ifdef cl_khr_fp16 | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| #endif | |||
| __kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape, | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(0); | |||
| @@ -10,10 +13,10 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, | |||
| return; | |||
| } | |||
| float4 r = (float4)(0.0f); | |||
| float window_size = 0.0f; | |||
| int xs = X * stride.x + padding.x; | |||
| int ys = Y * stride.y + padding.y; | |||
| FLT4 r = (FLT4)(0.0f); | |||
| FLT window_size = 0.0f; | |||
| int xs = X * stride.x - padding.x; | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| @@ -21,11 +24,11 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| bool outside = outside_x || y_c < 0 || y_c >= input_shape.y; | |||
| r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (float4)(0.0f); | |||
| r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (FLT4)(0.0f); | |||
| window_size += !outside ? 1.0f : 0.0f; | |||
| } | |||
| } | |||
| float4 result = convert_float4(r / window_size); | |||
| FLT4 result = TO_FLT4(r / window_size); | |||
| output[(output_shape.y * X + Y) * output_shape.w + Z] = result; | |||
| } | |||
| @@ -43,10 +46,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d | |||
| return; | |||
| } | |||
| float4 r = (float4)(0.0f); | |||
| float window_size = 0.0f; | |||
| int xs = X * stride.x + padding.x; | |||
| int ys = Y * stride.y + padding.y; | |||
| FLT4 r = (FLT4)(0.0f); | |||
| FLT window_size = 0.0f; | |||
| int xs = X * stride.x - padding.x; | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| @@ -54,10 +57,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; | |||
| r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)); | |||
| r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (float4)(0.0f); | |||
| window_size += !outside ? 1.0f : 0.0f; | |||
| } | |||
| } | |||
| float4 result = convert_float4(r / window_size); | |||
| write_imagef(output, (int2)(Y * output_shape.w + Z, X), result); | |||
| FLT4 result = TO_FLT4(r / window_size); | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result); | |||
| } | |||
| @@ -1,4 +1,7 @@ | |||
| __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape, | |||
| #ifdef cl_khr_fp16 | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| #endif | |||
| __kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape, | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(0); | |||
| @@ -10,9 +13,9 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, | |||
| return; | |||
| } | |||
| float4 maximum = (float4)(-10000.0f); | |||
| int xs = X * stride.x + padding.x; | |||
| int ys = Y * stride.y + padding.y; | |||
| FLT4 maximum = (FLT4)(-10000.0f); | |||
| int xs = X * stride.x - padding.x; | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| @@ -24,7 +27,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, | |||
| if (y_c < 0 || y_c >= input_shape.y) { | |||
| continue; | |||
| } | |||
| float4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z]; | |||
| FLT4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z]; | |||
| maximum = max(src, maximum); | |||
| } | |||
| } | |||
| @@ -45,18 +48,18 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d | |||
| return; | |||
| } | |||
| float4 maximum = (float4)(-10000.0f); | |||
| int xs = X * stride.x + padding.x; | |||
| int ys = Y * stride.y + padding.y; | |||
| FLT4 maximum = (FLT4)(-10000.0f); | |||
| int xs = X * stride.x - padding.x; | |||
| int ys = Y * stride.y - padding.y; | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| if (y_c < 0 || y_c >= input_shape.y) continue; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| if (x_c < 0 || x_c >= input_shape.x) continue; | |||
| float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c)); | |||
| FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c)); | |||
| maximum = max(src, maximum); | |||
| } | |||
| } | |||
| write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum); | |||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum); | |||
| } | |||
| @@ -58,14 +58,13 @@ int MatMulOpenCLKernel::Init() { | |||
| sizeCO = {co, UP_DIV(co, C4NUM)}; | |||
| PadWeight(); | |||
| in_ori_format_ = in_tensors_[0]->GetFormat(); | |||
| in_tensors_[0]->SetFormat(schema::Format_NHWC4); | |||
| out_ori_format_ = out_tensors_[0]->GetFormat(); | |||
| out_tensors_[0]->SetFormat(schema::Format_NHWC4); | |||
| if (out_tensors_[0]->shape().size() == 2) { | |||
| out_ori_format_ = schema::Format_NC; | |||
| out_tensors_[0]->SetFormat(schema::Format_NC4); | |||
| in_ori_format_ = schema::Format_NC; | |||
| in_tensors_[0]->SetFormat(schema::Format_NC4); | |||
| } else { | |||
| in_tensors_[0]->SetFormat(schema::Format_NHWC4); | |||
| out_tensors_[0]->SetFormat(schema::Format_NHWC4); | |||
| } | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return RET_OK; | |||
| @@ -60,7 +60,7 @@ int PoolingOpenCLKernel::Init() { | |||
| return RET_INVALID_OP_NAME; | |||
| } | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| enable_fp16_ = ocl_runtime->GetFp16Enable(); | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| @@ -96,11 +96,10 @@ int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) | |||
| size_t im_dst_x, im_dst_y; | |||
| im_dst_x = out_tensors_[0]->Width() * CO4; | |||
| im_dst_y = out_tensors_[0]->Height(); | |||
| #ifdef ENABLE_FP16 | |||
| size_t img_dtype = CL_HALF_FLOAT; | |||
| #else | |||
| size_t img_dtype = CL_FLOAT; | |||
| #endif | |||
| if (enable_fp16_) { | |||
| img_dtype = CL_HALF_FLOAT; | |||
| } | |||
| img_size->clear(); | |||
| std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype}; | |||
| *img_size = vec; | |||
| @@ -161,5 +160,6 @@ kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector<lite::tensor: | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -44,6 +44,7 @@ class PoolingOpenCLKernel : public OpenCLKernel { | |||
| std::vector<size_t> InitGlobalSize() const; | |||
| PoolingParameter *parameter_; | |||
| cl::Kernel kernel_; | |||
| bool enable_fp16_{false}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -50,6 +50,7 @@ class OpenCLKernel : public LiteKernel { | |||
| } | |||
| OpenCLMemType GetMemType() { return out_mem_type_; } | |||
| void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; } | |||
| void SetFormatType(schema::Format format_type) { op_format_ = format_type; } | |||
| schema::Format GetInOriFormat() { return in_ori_format_; } | |||
| schema::Format GetOutOriFormat() { return out_ori_format_; } | |||
| @@ -57,6 +58,7 @@ class OpenCLKernel : public LiteKernel { | |||
| OpenCLMemType out_mem_type_{OpenCLMemType::IMG}; | |||
| schema::Format in_ori_format_{schema::Format_NHWC}; | |||
| schema::Format out_ori_format_{schema::Format_NHWC4}; | |||
| schema::Format op_format_{schema::Format_NC4HW4}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||